This is intended to replace the previous zram. zram users can enable this feature, and a pseudo device will then be created automatically after kernel boot. Just run "mkswap /dev/zram0; swapon /dev/zram0" to use it as a swap disk. The size of this pseudo device is controlled by the zswap boot parameter zswap.max_pool_percent: disksize = (totalram_pages * zswap.max_pool_percent/100)*PAGE_SIZE. Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx> --- mm/Kconfig | 12 ++++ mm/zswap.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index d80a575..3778026 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -525,6 +525,18 @@ choice be refused unless frontswap_get happened and freed some space. endchoice +config ZSWAP_PSEUDO_BLKDEV + bool "Emulate a pseudo blk-dev based on zswap(previous zram)" + depends on ZSWAP && ZSMALLOC + default n + + help + Enable this option will emulate a pseudo block swapdev /dev/zram0 + with size zswap.max_pool_percent of total ram size. All writes to this + block device will be compressed and cached by zswap as a result no + real IO disk operations will happen. + This feature can be used to replace drivers/staging/zram. 
+ config MEM_SOFT_DIRTY bool "Track memory changes" depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY diff --git a/mm/zswap.c b/mm/zswap.c index 8e8dc99..ae73c9d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -38,6 +38,11 @@ #include <linux/zbud.h> #else #include <linux/zsmalloc.h> +#ifdef CONFIG_ZSWAP_PSEUDO_BLKDEV +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/genhd.h> +#endif #endif #include <linux/mm_types.h> #include <linux/page-flags.h> @@ -968,6 +973,189 @@ static int __init zswap_debugfs_init(void) static void __exit zswap_debugfs_exit(void) { } #endif +#ifdef CONFIG_ZSWAP_PSEUDO_BLKDEV +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1 << SECTOR_SHIFT) +#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) + +struct zram { + struct rw_semaphore lock; /* protect concurent reads and writes */ + struct request_queue *queue; + struct gendisk *disk; + + /* + * This is the disk size for userland. The size is controlled by + * boot parameter zswap.max_pool_percent. + * disksize = (totalram_pages * zswap.max_pool_percent/100)*PAGE_SIZE + */ + u64 disksize; /* bytes */ + + /* + * This page is used to store real data for /dev/zram. + * Meanful operation to /dev/zramx is only mkswp and swapon/swapoff. + * So use one page to store the real data(written by mkswp). + */ + struct page *metapage; +}; + +/* + * Only create /dev/zram0, can be extened in future if there is real uercases + * need multiple zram devices. 
+ */ +static struct zram zram_device; +static const struct block_device_operations zram_devops = { + .owner = THIS_MODULE +}; + +static void update_position(u32 *index, int *offset, struct bio_vec *bvec) +{ + if (*offset + bvec->bv_len >= PAGE_SIZE) + (*index)++; + *offset = (*offset + bvec->bv_len) % PAGE_SIZE; +} + +static void zram_make_request(struct request_queue *queue, struct bio *bio) +{ + u32 index; + struct bio_vec *bvec; + unsigned char *src, *dst; + int offset, i, rw = bio_data_dir(bio); + struct zram *zram = queue->queuedata; + + index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT; + offset = (bio->bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; + + bio_for_each_segment(bvec, bio, i) { + /* + * The only operation to pseudo /dev/zramx is mkswp and + * swapon/swapoff, so we only need one extra page to store the + * real meta data! + */ + BUG_ON(bvec->bv_len != PAGE_SIZE); + BUG_ON(offset); + + if (!index) { + if (rw == READ) { + down_read(&zram->lock); + dst = kmap_atomic(bvec->bv_page); + src = kmap_atomic(zram->metapage); + memcpy(dst, src, bvec->bv_len); + kunmap_atomic(dst); + kunmap_atomic(src); + flush_dcache_page(bvec->bv_page); + up_read(&zram->lock); + } else { + down_write(&zram->lock); + src = kmap_atomic(bvec->bv_page); + dst = kmap_atomic(zram->metapage); + memcpy(dst, src, bvec->bv_len); + kunmap_atomic(dst); + kunmap_atomic(src); + up_write(&zram->lock); + } + } + update_position(&index, &offset, bvec); + } + set_bit(BIO_UPTODATE, &bio->bi_flags); + bio_endio(bio, 0); + return; +} + +static int create_zram_device(struct zram *zram, int major, int device_id) +{ + int ret = -ENOMEM; + u64 disksize; + + zram->queue = blk_alloc_queue(GFP_KERNEL); + if (!zram->queue) { + pr_err("Error allocating disk queue for device%d\n", device_id); + goto out; + } + + blk_queue_make_request(zram->queue, zram_make_request); + zram->queue->queuedata = zram; + + /* gendisk structure */ + zram->disk = alloc_disk(1); + if (!zram->disk) { + pr_warn("Error 
allocating disk structure for device %d\n", + device_id); + goto out_free_queue; + } + + zram->disk->major = major; + zram->disk->first_minor = device_id; + zram->disk->fops = &zram_devops; + zram->disk->queue = zram->queue; + snprintf(zram->disk->disk_name, 16, "zram%d", device_id); + + /* + * To ensure that we always get PAGE_SIZE aligned + * and n*PAGE_SIZED sized I/O requests. + */ + blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE); + blk_queue_logical_block_size(zram->disk->queue, 1<<12); + blk_queue_io_min(zram->disk->queue, PAGE_SIZE); + blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); + + add_disk(zram->disk); + + /* Init blk-dev */ + disksize = totalram_pages * zswap_max_pool_percent / 100; + disksize *= PAGE_SIZE; + disksize = PAGE_ALIGN(disksize); + zram->disksize = disksize; + set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + + /* zram devices sort of resembles non-rotational disks */ + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); + + zram->metapage = alloc_page(GFP_KERNEL); + if (!zram->metapage) + goto out_free_disk; + + pr_debug("Initialization done!\n"); + return 0; + +out_free_disk: + pr_debug("Init zram meta pages fail!\n"); + del_gendisk(zram->disk); + put_disk(zram->disk); +out_free_queue: + blk_cleanup_queue(zram->queue); +out: + return ret; +} + +static int zswap_blkdev_init(void) +{ + int major, ret = 0; + + major = register_blkdev(0, "zram"); + if (major <= 0) { + pr_warn("Unable to get major number\n"); + ret = -EBUSY; + goto out; + } + + ret = create_zram_device(&zram_device, major, 0); + if (ret) { + unregister_blkdev(major, "zram"); + goto out; + } + + pr_info("Created zram device(%d, %d).\n", major, 0); +out: + return ret; +} +#else +static int zswap_blkdev_init(void) +{ + return 0; +} +#endif + /********************************* * module init and exit **********************************/ @@ -989,9 +1177,17 @@ static int __init init_zswap(void) pr_err("per-cpu initialization failed\n"); goto 
pcpufail; } + + if (IS_ENABLED(CONFIG_ZSWAP_PSEUDO_BLKDEV)) + if (zswap_blkdev_init()) { + pr_err("emulate blk device failed\n"); + goto pcpufail; + } + frontswap_register_ops(&zswap_frontswap_ops); if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); + return 0; pcpufail: zswap_comp_exit(); -- 1.7.10.4 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>