From: Zhi Yong Wu <wuzhy@xxxxxxxxxxxxxxxxxx>

This patch hooks the read/write code paths (read_pages(), do_writepages(),
do_generic_file_read() and __blockdev_direct_IO()) to record heat
information.

When real disk I/O is done for an inode, its hot_inode_item is created or
updated in the per-filesystem RB tree, and the I/O frequency of each of its
ranges is created or updated in the per-inode RB tree.

Both hot_inode_item and hot_range_item embed a struct hot_freq that carries
the access frequency metrics: the number of reads/writes, the last
read/write times, and the average time between reads/writes. Each
hot_inode_item contains one hot_range_tree, keyed by the start offset of
each range, which keeps track of all the tracked ranges in the file.

Signed-off-by: Chandra Seetharaman <sekharan@xxxxxxxxxx>
Signed-off-by: Zhi Yong Wu <wuzhy@xxxxxxxxxxxxxxxxxx>
---
 fs/hot_tracking.c            | 259 +++++++++++++++++++++++++++++++++++++++++++
 fs/hot_tracking.h            |   3 +
 fs/namei.c                   |   4 +
 include/linux/hot_tracking.h |  19 ++++
 mm/filemap.c                 |  24 +++-
 mm/readahead.c               |   6 +
 6 files changed, 313 insertions(+), 2 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 25e7858..d68c458 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -22,6 +22,8 @@ static void hot_range_item_init(struct hot_range_item *hr,
 		struct hot_inode_item *he, loff_t start)
 {
 	kref_init(&hr->refs);
+	hr->freq.avg_delta_reads = (u64) -1;
+	hr->freq.avg_delta_writes = (u64) -1;
 	hr->start = start;
 	hr->len = 1 << RANGE_BITS;
 	hr->hot_inode = he;
@@ -59,6 +61,62 @@ static void hot_range_item_put(struct hot_range_item *hr)
 	kref_put(&hr->refs, hot_range_item_free);
 }
 
+static struct hot_range_item
+*hot_range_item_alloc(struct hot_inode_item *he, loff_t start)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct hot_range_item *hr, *hr_new = NULL;
+
+	start = start << RANGE_BITS;
+
+	/* walk tree to find insertion point */
+redo:
+	spin_lock(&he->i_lock);
+	p = &he->hot_range_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		hr = rb_entry(parent, struct hot_range_item, rb_node);
+		if (start < hr->start)
+			p = &(*p)->rb_left;
+		else if (start > (hr->start + hr->len - 1))
+			p = &(*p)->rb_right;
+		else {
+			hot_range_item_get(hr);
+			if (hr_new) {
+				/*
+				 * Lost the race. Somebody else inserted
+				 * the item for the range. Free the
+				 * newly allocated item.
+				 */
+				kmem_cache_free(hot_range_item_cachep, hr_new);
+			}
+			spin_unlock(&he->i_lock);
+
+			return hr;
+		}
+	}
+
+	if (hr_new) {
+		rb_link_node(&hr_new->rb_node, parent, p);
+		rb_insert_color(&hr_new->rb_node, &he->hot_range_tree);
+		hot_range_item_get(hr_new); /* For the caller */
+		spin_unlock(&he->i_lock);
+		return hr_new;
+	}
+	spin_unlock(&he->i_lock);
+
+	hr_new = kmem_cache_zalloc(hot_range_item_cachep, GFP_NOFS);
+	if (!hr_new)
+		return ERR_PTR(-ENOMEM);
+
+	hot_range_item_init(hr_new, he, start);
+
+	cond_resched();
+
+	goto redo;
+}
+
 /*
  * Free the entire hot_range_tree.
 */
@@ -82,6 +140,8 @@ static void hot_inode_item_init(struct hot_inode_item *he,
 		struct hot_info *root, u64 ino)
 {
 	kref_init(&he->refs);
+	he->freq.avg_delta_reads = (u64) -1;
+	he->freq.avg_delta_writes = (u64) -1;
 	he->ino = ino;
 	he->hot_root = root;
 	spin_lock_init(&he->i_lock);
@@ -120,6 +180,153 @@ void hot_inode_item_put(struct hot_inode_item *he)
 	kref_put(&he->refs, hot_inode_item_free);
 }
 
+static struct hot_inode_item
+*hot_inode_item_alloc(struct hot_info *root, u64 ino)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct hot_inode_item *he, *he_new = NULL;
+
+	/* walk tree to find insertion point */
+redo:
+	spin_lock(&root->t_lock);
+	p = &root->hot_inode_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		he = rb_entry(parent, struct hot_inode_item, rb_node);
+		if (ino < he->ino)
+			p = &(*p)->rb_left;
+		else if (ino > he->ino)
+			p = &(*p)->rb_right;
+		else {
+			hot_inode_item_get(he);
+			if (he_new) {
+				/*
+				 * Lost the race. Somebody else inserted
+				 * the item for the inode. Free the
+				 * newly allocated item.
+				 */
+				kmem_cache_free(hot_inode_item_cachep, he_new);
+			}
+			spin_unlock(&root->t_lock);
+
+			return he;
+		}
+	}
+
+	if (he_new) {
+		rb_link_node(&he_new->rb_node, parent, p);
+		rb_insert_color(&he_new->rb_node, &root->hot_inode_tree);
+		hot_inode_item_get(he_new); /* For the caller */
+		spin_unlock(&root->t_lock);
+		return he_new;
+	}
+	spin_unlock(&root->t_lock);
+
+	he_new = kmem_cache_zalloc(hot_inode_item_cachep, GFP_NOFS);
+	if (!he_new)
+		return ERR_PTR(-ENOMEM);
+
+	hot_inode_item_init(he_new, root, ino);
+
+	cond_resched();
+
+	goto redo;
+}
+
+struct hot_inode_item
+*hot_inode_item_lookup(struct hot_info *root, u64 ino)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct hot_inode_item *he;
+
+	/* walk tree to find insertion point */
+	spin_lock(&root->t_lock);
+	p = &root->hot_inode_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		he = rb_entry(parent, struct hot_inode_item, rb_node);
+		if (ino < he->ino)
+			p = &(*p)->rb_left;
+		else if (ino > he->ino)
+			p = &(*p)->rb_right;
+		else {
+			hot_inode_item_get(he);
+			spin_unlock(&root->t_lock);
+
+			return he;
+		}
+	}
+	spin_unlock(&root->t_lock);
+
+	return ERR_PTR(-ENOENT);
+}
+
+void hot_inode_item_unlink(struct inode *inode)
+{
+	struct hot_info *root = inode->i_sb->s_hot_root;
+	struct hot_inode_item *he;
+
+	if (!(inode->i_sb->s_flags & MS_HOTTRACK)
+		|| !S_ISREG(inode->i_mode))
+		return;
+
+	he = hot_inode_item_lookup(root, inode->i_ino);
+	if (IS_ERR(he))
+		return;
+
+	spin_lock(&root->t_lock);
+	hot_inode_item_put(he);
+	hot_inode_item_put(he); /* For the caller */
+	spin_unlock(&root->t_lock);
+}
+
+/*
+ * This function does the actual work of updating
+ * the frequency numbers.
+ *
+ * avg_delta_{reads,writes} are indeed a kind of simple moving
+ * average of the time difference between each of the last
+ * 2^(FREQ_POWER) reads/writes. If there have not yet been that
+ * many reads or writes, it's likely that the values will be very
+ * large; they are initialized to the largest possible value for the
+ * data type. Simply, we don't want a few fast accesses to a file to
+ * automatically make it appear very hot.
+ */
+static void hot_freq_calc(struct timespec old_atime,
+		struct timespec cur_time, u64 *avg)
+{
+	struct timespec delta_ts;
+	u64 new_delta;
+
+	delta_ts = timespec_sub(cur_time, old_atime);
+	new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
+
+	*avg = (*avg << FREQ_POWER) - *avg + new_delta;
+	*avg = *avg >> FREQ_POWER;
+}
+
+static void hot_freq_update(struct hot_info *root,
+		struct hot_freq *freq, bool write)
+{
+	struct timespec cur_time = current_kernel_time();
+
+	if (write) {
+		freq->nr_writes += 1;
+		hot_freq_calc(freq->last_write_time,
+				cur_time,
+				&freq->avg_delta_writes);
+		freq->last_write_time = cur_time;
+	} else {
+		freq->nr_reads += 1;
+		hot_freq_calc(freq->last_read_time,
+				cur_time,
+				&freq->avg_delta_reads);
+		freq->last_read_time = cur_time;
+	}
+}
+
 /*
  * Initialize kmem cache for hot_inode_item and hot_range_item.
  */
@@ -136,6 +343,58 @@ void __init hot_cache_init(void)
 	kmem_cache_destroy(hot_inode_item_cachep);
 }
 
+/*
+ * Main function to update i/o access frequencies, and it will be called
+ * from read/writepages() hooks, which are read_pages(), do_writepages(),
+ * do_generic_file_read(), and __blockdev_direct_IO().
+ */
+inline void hot_freqs_update(struct inode *inode, loff_t start,
+			size_t len, int rw)
+{
+	struct hot_info *root = inode->i_sb->s_hot_root;
+	struct hot_inode_item *he;
+	struct hot_range_item *hr;
+	u64 range_size;
+	loff_t cur, end;
+
+	if (!(inode->i_sb->s_flags & MS_HOTTRACK) || (len == 0)
+		|| !S_ISREG(inode->i_mode) || !inode->i_nlink)
+		return;
+
+	he = hot_inode_item_alloc(root, inode->i_ino);
+	if (IS_ERR(he))
+		return;
+
+	hot_freq_update(root, &he->freq, rw);
+
+	/*
+	 * Align ranges on range size boundary
+	 * to prevent proliferation of range structs
+	 */
+	range_size = 1 << RANGE_BITS;
+	end = (start + len + range_size - 1) >> RANGE_BITS;
+	cur = start >> RANGE_BITS;
+	for (; cur < end; cur++) {
+		hr = hot_range_item_alloc(he, cur);
+		if (IS_ERR(hr)) {
+			WARN(1, "hot_range_item_alloc returns %ld\n",
+				PTR_ERR(hr));
+			return;
+		}
+
+		hot_freq_update(root, &hr->freq, rw);
+
+		spin_lock(&he->i_lock);
+		hot_range_item_put(hr);
+		spin_unlock(&he->i_lock);
+	}
+
+	spin_lock(&root->t_lock);
+	hot_inode_item_put(he);
+	spin_unlock(&root->t_lock);
+}
+EXPORT_SYMBOL(hot_freqs_update);
+
 static struct hot_info *hot_tree_init(struct super_block *sb)
 {
 	struct hot_info *root;
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index 51d829e..56ab13c 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -16,8 +16,11 @@
 
 /* size of sub-file ranges */
 #define RANGE_BITS 20
+#define FREQ_POWER 4
 
 void __init hot_cache_init(void);
 void hot_inode_item_put(struct hot_inode_item *he);
+struct hot_inode_item *hot_inode_item_lookup(struct hot_info *root, u64 ino);
+void hot_inode_item_unlink(struct inode *inode);
 
 #endif /* __HOT_TRACKING__ */
diff --git a/fs/namei.c b/fs/namei.c
index caa2805..e50af1e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -38,6 +38,7 @@
 
 #include "internal.h"
 #include "mount.h"
+#include "hot_tracking.h"
 
 /* [Feb-1997 T. Schoebel-Theuer]
  * Fundamental changes in the pathname lookup mechanisms (namei)
@@ -3668,6 +3669,9 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 			dont_mount(dentry);
 		}
 	}
+
+	if (!error && !dentry->d_inode->i_nlink)
+		hot_inode_item_unlink(dentry->d_inode);
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index 91633db..5f02025 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -31,8 +31,24 @@ enum {
 	MAX_TYPES,
 };
 
+/*
+ * A frequency data struct holds values that are used to
+ * determine temperature of files and file ranges. These structs
+ * are members of hot_inode_item and hot_range_item
+ */
+struct hot_freq {
+	struct timespec last_read_time;
+	struct timespec last_write_time;
+	u32 nr_reads;
+	u32 nr_writes;
+	u64 avg_delta_reads;
+	u64 avg_delta_writes;
+	u32 last_temp;
+};
+
 /* An item representing an inode and its access frequency */
 struct hot_inode_item {
+	struct hot_freq freq;		/* frequency data */
 	struct kref refs;
 	struct rb_node rb_node;		/* rbtree index */
 	struct rcu_head rcu;
@@ -47,6 +63,7 @@ struct hot_inode_item {
  * an inode whose frequency is being tracked
  */
 struct hot_range_item {
+	struct hot_freq freq;		/* frequency data */
 	struct kref refs;
 	struct rb_node rb_node;		/* rbtree index */
 	struct rcu_head rcu;
@@ -62,5 +79,7 @@ struct hot_info {
 
 extern int hot_track_init(struct super_block *sb);
 extern void hot_track_exit(struct super_block *sb);
+extern void hot_freqs_update(struct inode *inode, loff_t start,
+				size_t len, int rw);
 
 #endif /* _LINUX_HOTTRACK_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index ae4846f..d70939d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
+#include <linux/hot_tracking.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -1196,6 +1197,10 @@ page_ok:
 			mark_page_accessed(page);
 		prev_index = index;
 
+		/* Hot tracking */
+		hot_freqs_update(inode, page->index << PAGE_CACHE_SHIFT,
+				PAGE_CACHE_SIZE, 0);
+
 		/*
 		 * Ok, we have the page, and it's up-to-date, so
 		 * now we can copy it to user space...
@@ -1514,9 +1519,13 @@ static int page_cache_read(struct file *file, pgoff_t offset)
 			return -ENOMEM;
 
 		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
-		if (ret == 0)
+		if (ret == 0) {
+			/* Hot tracking */
+			hot_freqs_update(mapping->host,
+					page->index << PAGE_CACHE_SHIFT,
+					PAGE_CACHE_SIZE, 0);
 			ret = mapping->a_ops->readpage(file, page);
-		else if (ret == -EEXIST)
+		} else if (ret == -EEXIST)
 			ret = 0; /* losing race to add is OK */
 
 		page_cache_release(page);
@@ -1711,6 +1720,11 @@ page_not_uptodate:
 	 * and we need to check for errors.
 	 */
 	ClearPageError(page);
+
+	/* Hot tracking */
+	hot_freqs_update(inode, page->index << PAGE_CACHE_SHIFT,
+			PAGE_CACHE_SIZE, 0);
+
 	error = mapping->a_ops->readpage(file, page);
 	if (!error) {
 		wait_on_page_locked(page);
@@ -2249,6 +2263,9 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	}
 
 	if (written > 0) {
+		/* Hot tracking */
+		hot_freqs_update(inode, pos, written, 1);
+
 		pos += written;
 		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
 			i_size_write(inode, pos);
@@ -2404,6 +2421,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	status = generic_perform_write(file, &i, pos);
 	if (likely(status >= 0)) {
+		/* Hot tracking */
+		hot_freqs_update(file_inode(file), pos, status, 1);
+
 		written += status;
 		*ppos = pos + status;
 	}
 
diff --git a/mm/readahead.c b/mm/readahead.c
index e4ed041..51f0e88 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
+#include <linux/hot_tracking.h>
 
 /*
  * Initialise a struct file's readahead state.  Assumes that the caller has
@@ -115,6 +116,11 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 	unsigned page_idx;
 	int ret;
 
+	/* Hot tracking */
+	hot_freqs_update(mapping->host,
+		list_to_page(pages)->index << PAGE_CACHE_SHIFT,
+		(size_t)nr_pages * PAGE_CACHE_SIZE, 0);
+
 	blk_start_plug(&plug);
 
 	if (mapping->a_ops->readpages) {
-- 
1.7.11.7
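
For reference, hot_freq_calc() above maintains avg_delta_{reads,writes} as a
shift-based exponential moving average over roughly the last 2^FREQ_POWER
accesses, with each new delta pre-scaled by the same shift. Below is a minimal
userspace sketch of that arithmetic; the freq_calc() helper, the demo main()
and the 1 ms access pattern are illustrative assumptions and are not part of
the patch.

#include <stdio.h>
#include <stdint.h>

#define FREQ_POWER 4	/* average over roughly the last 2^4 = 16 accesses */

/*
 * One update step, mirroring hot_freq_calc():
 *   avg = (avg * (2^FREQ_POWER - 1) + (delta_ns >> FREQ_POWER)) / 2^FREQ_POWER
 * uint64_t arithmetic wraps on overflow, like the kernel's u64 fields.
 */
static void freq_calc(uint64_t delta_ns, uint64_t *avg)
{
	uint64_t new_delta = delta_ns >> FREQ_POWER;

	*avg = (*avg << FREQ_POWER) - *avg + new_delta;
	*avg = *avg >> FREQ_POWER;
}

int main(void)
{
	/* Same initial value as avg_delta_{reads,writes} in the patch. */
	uint64_t avg = (uint64_t)-1;
	int i;

	/* Simulate 32 reads arriving exactly 1 ms (1,000,000 ns) apart. */
	for (i = 0; i < 32; i++) {
		freq_calc(1000000ULL, &avg);
		printf("read %2d: avg_delta_reads = %llu\n",
		       i + 1, (unsigned long long)avg);
	}

	return 0;
}

For a steady access interval the average settles toward
delta_ns >> FREQ_POWER (62500 in this sketch), so smaller avg_delta values
mean more frequent access; hot_freq_update() feeds it the time since the last
read or write of the same inode or range.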