On 01/31/2013 01:07 AM, Minchan Kim wrote: > On Tue, Jan 29, 2013 at 03:40:23PM -0600, Seth Jennings wrote: >> zswap is a thin compression backend for frontswap. It receives >> pages from frontswap and attempts to store them in a compressed >> memory pool, resulting in an effective partial memory reclaim and >> dramatically reduced swap device I/O. >> >> Additionally, in most cases, pages can be retrieved from this >> compressed store much more quickly than reading from tradition >> swap devices resulting in faster performance for many workloads. >> >> This patch adds the zswap driver to mm/ >> >> Signed-off-by: Seth Jennings <sjenning@xxxxxxxxxxxxxxxxxx> >> --- >> mm/Kconfig | 15 ++ >> mm/Makefile | 1 + >> mm/zswap.c | 656 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> 3 files changed, 672 insertions(+) >> create mode 100644 mm/zswap.c >> >> diff --git a/mm/Kconfig b/mm/Kconfig >> index 278e3ab..14b9acb 100644 >> --- a/mm/Kconfig >> +++ b/mm/Kconfig >> @@ -446,3 +446,18 @@ config FRONTSWAP >> and swap data is stored as normal on the matching swap device. >> >> If unsure, say Y to enable frontswap. >> + >> +config ZSWAP >> + bool "In-kernel swap page compression" >> + depends on FRONTSWAP && CRYPTO >> + select CRYPTO_LZO >> + select ZSMALLOC > > Again, I'm asking why zswap should have a dependent on CRPYTO? > Couldn't we support it as a option? I'd like to use zswap without CRYPTO > like zram. The reason we need CRYPTO is that zswap uses it to support a pluggable compression model. zswap can use any compressor that has a crypto API driver. zswap has _symbol dependencies_ on CRYPTO. If it isn't selected, the build breaks. > >> + default n >> + help >> + Zswap is a backend for the frontswap mechanism in the VMM. >> + It receives pages from frontswap and attempts to store them >> + in a compressed memory pool, resulting in an effective >> + partial memory reclaim. In addition, pages and be retrieved >> + from this compressed store much faster than most tradition >> + swap devices resulting in reduced I/O and faster performance >> + for many workloads. >> diff --git a/mm/Makefile b/mm/Makefile >> index 3a46287..1b1ed5c 100644 >> --- a/mm/Makefile >> +++ b/mm/Makefile >> @@ -32,6 +32,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o >> obj-$(CONFIG_BOUNCE) += bounce.o >> obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o >> obj-$(CONFIG_FRONTSWAP) += frontswap.o >> +obj-$(CONFIG_ZSWAP) += zswap.o >> obj-$(CONFIG_HAS_DMA) += dmapool.o >> obj-$(CONFIG_HUGETLBFS) += hugetlb.o >> obj-$(CONFIG_NUMA) += mempolicy.o >> diff --git a/mm/zswap.c b/mm/zswap.c >> new file mode 100644 >> index 0000000..a6c2928 >> --- /dev/null >> +++ b/mm/zswap.c >> @@ -0,0 +1,656 @@ >> +/* >> + * zswap-drv.c - zswap driver file >> + * >> + * zswap is a backend for frontswap that takes pages that are in the >> + * process of being swapped out and attempts to compress them and store >> + * them in a RAM-based memory pool. This results in a significant I/O >> + * reduction on the real swap device and, in the case of a slow swap >> + * device, can also improve workload performance. >> + * >> + * Copyright (C) 2012 Seth Jennings <sjenning@xxxxxxxxxxxxxxxxxx> >> + * >> + * This program is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU General Public License >> + * as published by the Free Software Foundation; either version 2 >> + * of the License, or (at your option) any later version. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> +*/ >> + >> +#include <linux/module.h> >> +#include <linux/cpu.h> >> +#include <linux/highmem.h> >> +#include <linux/slab.h> >> +#include <linux/spinlock.h> >> +#include <linux/types.h> >> +#include <linux/atomic.h> >> +#include <linux/frontswap.h> >> +#include <linux/rbtree.h> >> +#include <linux/swap.h> >> +#include <linux/crypto.h> >> +#include <linux/mempool.h> >> +#include <linux/zsmalloc.h> >> + >> +/********************************* >> +* statistics >> +**********************************/ >> +/* Number of memory pages used by the compressed pool */ >> +static atomic_t zswap_pool_pages = ATOMIC_INIT(0); >> +/* The number of compressed pages currently stored in zswap */ >> +static atomic_t zswap_stored_pages = ATOMIC_INIT(0); > > Just nitpicking. > > It seems future zsmalloc users want to gather information > like this so it might be nice to add them into zsmalloc's > internal like malloc_stats(2) > >> + >> +/* >> + * The statistics below are not protected from concurrent access for >> + * performance reasons so they may not be a 100% accurate. However, >> + * the do provide useful information on roughly how many times a >> + * certain event is occurring. >> +*/ >> +static u64 zswap_pool_limit_hit; >> +static u64 zswap_reject_compress_poor; >> +static u64 zswap_reject_zsmalloc_fail; >> +static u64 zswap_reject_kmemcache_fail; >> +static u64 zswap_duplicate_entry; >> + >> +/********************************* >> +* tunables >> +**********************************/ >> +/* Enable/disable zswap (disabled by default, fixed at boot for now) */ >> +static bool zswap_enabled; >> +module_param_named(enabled, zswap_enabled, bool, 0); >> + >> +/* Compressor to be used by zswap (fixed at boot for now) */ >> +#define ZSWAP_COMPRESSOR_DEFAULT "lzo" >> +static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; >> +module_param_named(compressor, zswap_compressor, charp, 0); >> + >> +/* The maximum percentage of memory that the compressed pool can occupy */ >> +static unsigned int zswap_max_pool_percent = 20; >> +module_param_named(max_pool_percent, >> + zswap_max_pool_percent, uint, 0644); >> + >> +/* >> + * Maximum compression ratio, as as percentage, for an acceptable >> + * compressed page. Any pages that do not compress by at least >> + * this ratio will be rejected. >> +*/ >> +static unsigned int zswap_max_compression_ratio = 80; >> +module_param_named(max_compression_ratio, >> + zswap_max_compression_ratio, uint, 0644); >> + >> +/********************************* >> +* compression functions >> +**********************************/ >> +/* per-cpu compression transforms */ >> +static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms; >> + >> +enum comp_op { >> + ZSWAP_COMPOP_COMPRESS, >> + ZSWAP_COMPOP_DECOMPRESS >> +}; >> + >> +static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen, >> + u8 *dst, unsigned int *dlen) >> +{ >> + struct crypto_comp *tfm; >> + int ret; >> + >> + tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu()); >> + switch (op) { >> + case ZSWAP_COMPOP_COMPRESS: >> + ret = crypto_comp_compress(tfm, src, slen, dst, dlen); See here. crypto_comp_compress() is a symbolic dependency on CRYPTO >> + break; >> + case ZSWAP_COMPOP_DECOMPRESS: >> + ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); And here. >> + break; >> + default: >> + ret = -EINVAL; >> + } >> + >> + put_cpu(); >> + return ret; >> +} >> + >> +static int __init zswap_comp_init(void) >> +{ >> + if (!crypto_has_comp(zswap_compressor, 0, 0)) { And here. >> + pr_info("zswap: %s compressor not available\n", >> + zswap_compressor); >> + /* fall back to default compressor */ >> + zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; >> + if (!crypto_has_comp(zswap_compressor, 0, 0)) >> + /* can't even load the default compressor */ >> + return -ENODEV; >> + } >> + pr_info("zswap: using %s compressor\n", zswap_compressor); >> + >> + /* alloc percpu transforms */ >> + zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); >> + if (!zswap_comp_pcpu_tfms) >> + return -ENOMEM; >> + return 0; >> +} >> + >> +static void zswap_comp_exit(void) >> +{ >> + /* free percpu transforms */ >> + if (zswap_comp_pcpu_tfms) >> + free_percpu(zswap_comp_pcpu_tfms); >> +} >> + >> +/********************************* >> +* data structures >> +**********************************/ >> +struct zswap_entry { >> + struct rb_node rbnode; >> + unsigned type; >> + pgoff_t offset; >> + unsigned long handle; >> + unsigned int length; >> +}; >> + >> +struct zswap_tree { >> + struct rb_root rbroot; >> + spinlock_t lock; >> + struct zs_pool *pool; >> +}; >> + >> +static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; >> + >> +/********************************* >> +* zswap entry functions >> +**********************************/ >> +#define ZSWAP_KMEM_CACHE_NAME "zswap_entry_cache" >> +static struct kmem_cache *zswap_entry_cache; >> + >> +static inline int zswap_entry_cache_create(void) >> +{ >> + zswap_entry_cache = >> + kmem_cache_create(ZSWAP_KMEM_CACHE_NAME, >> + sizeof(struct zswap_entry), 0, 0, NULL); >> + return (zswap_entry_cache == NULL); >> +} >> + >> +static inline void zswap_entry_cache_destory(void) >> +{ >> + kmem_cache_destroy(zswap_entry_cache); >> +} >> + >> +static inline struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) >> +{ >> + struct zswap_entry *entry; >> + entry = kmem_cache_alloc(zswap_entry_cache, gfp); >> + if (!entry) >> + return NULL; >> + return entry; >> +} >> + >> +static inline void zswap_entry_cache_free(struct zswap_entry *entry) >> +{ >> + kmem_cache_free(zswap_entry_cache, entry); >> +} >> + >> +/********************************* >> +* rbtree functions >> +**********************************/ >> +static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) >> +{ >> + struct rb_node *node = root->rb_node; >> + struct zswap_entry *entry; >> + >> + while (node) { >> + entry = rb_entry(node, struct zswap_entry, rbnode); >> + if (entry->offset > offset) >> + node = node->rb_left; >> + else if (entry->offset < offset) >> + node = node->rb_right; >> + else >> + return entry; >> + } >> + return NULL; >> +} >> + >> +/* >> + * In the case that a entry with the same offset is found, it a pointer to >> + * the existing entry is stored in dupentry and the function returns -EEXIST >> +*/ >> +static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, >> + struct zswap_entry **dupentry) >> +{ >> + struct rb_node **link = &root->rb_node, *parent = NULL; >> + struct zswap_entry *myentry; >> + >> + while (*link) { >> + parent = *link; >> + myentry = rb_entry(parent, struct zswap_entry, rbnode); >> + if (myentry->offset > entry->offset) >> + link = &(*link)->rb_left; >> + else if (myentry->offset < entry->offset) >> + link = &(*link)->rb_right; >> + else { >> + *dupentry = myentry; >> + return -EEXIST; > > I sent a mail to frontswap guys to remove existing entry handling. > The frontswap_store can be called on existing entry if reuse_swap_page > encounters writeback-ing page. Let's see how it is going. Thanks, It'd be nice to do away with this nastiness. > >> + } >> + } >> + rb_link_node(&entry->rbnode, parent, link); >> + rb_insert_color(&entry->rbnode, root); >> + return 0; >> +} >> + >> +/********************************* >> +* per-cpu code >> +**********************************/ >> +static DEFINE_PER_CPU(u8 *, zswap_dstmem); >> + >> +static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) >> +{ >> + struct crypto_comp *tfm; >> + u8 *dst; >> + >> + switch (action) { >> + case CPU_UP_PREPARE: >> + tfm = crypto_alloc_comp(zswap_compressor, 0, 0); >> + if (IS_ERR(tfm)) { >> + pr_err("zswap: can't allocate compressor transform\n"); >> + return NOTIFY_BAD; >> + } >> + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; >> + dst = (u8 *)__get_free_pages(GFP_KERNEL, 1); >> + if (!dst) { >> + pr_err("zswap: can't allocate compressor buffer\n"); >> + crypto_free_comp(tfm); >> + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; >> + return NOTIFY_BAD; >> + } >> + per_cpu(zswap_dstmem, cpu) = dst; >> + break; >> + case CPU_DEAD: >> + case CPU_UP_CANCELED: >> + tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu); >> + if (tfm) { >> + crypto_free_comp(tfm); >> + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; >> + } >> + dst = per_cpu(zswap_dstmem, cpu); >> + if (dst) { >> + free_pages((unsigned long)dst, 1); >> + per_cpu(zswap_dstmem, cpu) = NULL; >> + } >> + break; >> + default: >> + break; >> + } >> + return NOTIFY_OK; >> +} >> + >> +static int zswap_cpu_notifier(struct notifier_block *nb, >> + unsigned long action, void *pcpu) >> +{ >> + unsigned long cpu = (unsigned long)pcpu; >> + return __zswap_cpu_notifier(action, cpu); >> +} >> + >> +static struct notifier_block zswap_cpu_notifier_block = { >> + .notifier_call = zswap_cpu_notifier >> +}; >> + >> +static int zswap_cpu_init(void) >> +{ >> + unsigned long cpu; >> + >> + get_online_cpus(); >> + for_each_online_cpu(cpu) >> + if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) >> + goto cleanup; >> + register_cpu_notifier(&zswap_cpu_notifier_block); >> + put_online_cpus(); >> + return 0; >> + >> +cleanup: >> + for_each_online_cpu(cpu) >> + __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); >> + put_online_cpus(); >> + return -ENOMEM; >> +} >> + >> +/********************************* >> +* zsmalloc callbacks >> +**********************************/ >> +static mempool_t *zswap_page_pool; >> + >> +static inline unsigned int zswap_max_pool_pages(void) >> +{ >> + return zswap_max_pool_percent * totalram_pages / 100; >> +} >> + >> +static inline int zswap_page_pool_create(void) >> +{ >> + /* TODO: dynamically size mempool */ >> + zswap_page_pool = mempool_create_page_pool(256, 0); >> + if (!zswap_page_pool) >> + return -ENOMEM; >> + return 0; >> +} >> + >> +static inline void zswap_page_pool_destroy(void) >> +{ >> + mempool_destroy(zswap_page_pool); >> +} >> + >> +static struct page *zswap_alloc_page(gfp_t flags) >> +{ >> + struct page *page; >> + >> + if (atomic_read(&zswap_pool_pages) >= zswap_max_pool_pages()) { >> + zswap_pool_limit_hit++; >> + return NULL; >> + } >> + page = mempool_alloc(zswap_page_pool, flags); >> + if (page) >> + atomic_inc(&zswap_pool_pages); >> + return page; >> +} >> + >> +static void zswap_free_page(struct page *page) >> +{ >> + if (!page) >> + return; >> + mempool_free(page, zswap_page_pool); >> + atomic_dec(&zswap_pool_pages); >> +} >> + >> +static struct zs_ops zswap_zs_ops = { >> + .alloc = zswap_alloc_page, >> + .free = zswap_free_page >> +}; >> + >> +/********************************* >> +* frontswap hooks >> +**********************************/ >> +/* attempts to compress and store an single page */ >> +static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct page *page) >> +{ >> + struct zswap_tree *tree = zswap_trees[type]; >> + struct zswap_entry *entry, *dupentry; >> + int ret; >> + unsigned int dlen = PAGE_SIZE; >> + unsigned long handle; >> + char *buf; >> + u8 *src, *dst; >> + >> + if (!tree) { > > We should fix frontswap_init can return error code instead of > checking validation of init everytime. > I will fix it after Konrad merged eariler frontswap patch for > frontswap_init call out of swap_lock. That would also be better. > >> + ret = -ENODEV; >> + goto reject; >> + } >> + >> + /* compress */ >> + dst = get_cpu_var(zswap_dstmem); >> + src = kmap_atomic(page); >> + ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen); >> + kunmap_atomic(src); >> + if (ret) { >> + ret = -EINVAL; >> + goto putcpu; >> + } >> + if ((dlen * 100 / PAGE_SIZE) > zswap_max_compression_ratio) { >> + zswap_reject_compress_poor++; >> + ret = -E2BIG; >> + goto putcpu; >> + } >> + >> + /* store */ >> + handle = zs_malloc(tree->pool, dlen, >> + __GFP_NORETRY | __GFP_HIGHMEM | __GFP_NOMEMALLOC | >> + __GFP_NOWARN); >> + if (!handle) { >> + zswap_reject_zsmalloc_fail++; >> + ret = -ENOMEM; >> + goto putcpu; >> + } >> + >> + buf = zs_map_object(tree->pool, handle, ZS_MM_WO); >> + memcpy(buf, dst, dlen); >> + zs_unmap_object(tree->pool, handle); >> + put_cpu_var(zswap_dstmem); >> + >> + /* allocate entry */ >> + entry = zswap_entry_cache_alloc(GFP_KERNEL); >> + if (!entry) { >> + zs_free(tree->pool, handle); >> + zswap_reject_kmemcache_fail++; >> + ret = -ENOMEM; >> + goto reject; >> + } >> + >> + /* populate entry */ >> + entry->type = type; >> + entry->offset = offset; >> + entry->handle = handle; >> + entry->length = dlen; >> + >> + /* map */ >> + spin_lock(&tree->lock); >> + do { >> + ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); >> + if (ret == -EEXIST) { >> + zswap_duplicate_entry++; >> + >> + /* remove from rbtree */ >> + rb_erase(&dupentry->rbnode, &tree->rbroot); >> + > > While spaces damaged. Doh. Will fix. > Will continue to review tomorrow. Thanks! Seth -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>