Hi, Nitin.

Sorry for the late review.

On Mon, 2010-05-24 at 19:48 +0530, Nitin Gupta wrote:
> Currently, ramzswap devices (/dev/ramzswapX) can only
> be used as swap disks since it was hard-coded to consider
> only the first request in a bio vector.
>
> Now, we iterate over all the segments in an incoming
> bio, which allows us to handle all kinds of I/O requests.
>
> ramzswap devices can still handle only I/O requests that
> are PAGE_SIZE aligned and a multiple of PAGE_SIZE in size.
> To ensure that we always get such requests, we set the
> following request_queue attributes to PAGE_SIZE:
> - physical_block_size
> - logical_block_size
> - io_min
> - io_opt
>
> Note: physical and logical block sizes were already set
> equal to PAGE_SIZE, and that seems to be sufficient to get
> PAGE_SIZE aligned I/O.
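Just to confirm my reading of the queue limits above: I would expect
the setup in create_device() to end up looking roughly like the sketch
below. This is only my illustration, untested, and I am assuming the
queue is reachable as rzs->disk->queue; it is not a quote from the
patch.

	/* hint the block layer that we want PAGE_SIZE-granular I/O */
	blk_queue_physical_block_size(rzs->disk->queue, PAGE_SIZE);
	blk_queue_logical_block_size(rzs->disk->queue, PAGE_SIZE);
	blk_queue_io_min(rzs->disk->queue, PAGE_SIZE);
	blk_queue_io_opt(rzs->disk->queue, PAGE_SIZE);

With io_min/io_opt set as well, well-behaved users such as mkfs should
pick PAGE_SIZE-granular I/O on their own.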
> Since we are no longer limited to handling swap requests
> only, the next few patches rename ramzswap to zram. So,
> the devices will then be called /dev/zram{0, 1, 2, ...}
>
> Usage/Examples:
> 1) Use as /tmp storage:
>    - mkfs.ext4 /dev/zram0
>    - mount /dev/zram0 /tmp
>
> 2) Use as swap:
>    - mkswap /dev/zram0
>    - swapon /dev/zram0 -p 10  # give highest priority to zram0
>
> Performance:
>
> - I/O benchmark done with the 'dd' command. Details can be
>   found here:
>   http://code.google.com/p/compcache/wiki/zramperf
>   Summary:
>   - Maximum read speed (approx):
>     - ram disk:  1200 MB/sec
>     - zram disk:  600 MB/sec
>   - Maximum write speed (approx):
>     - ram disk:   500 MB/sec
>     - zram disk:  160 MB/sec
>
> Issues:
>
> - Double caching: We can potentially waste memory by having
>   two copies of a page -- one in the page cache (uncompressed)
>   and a second in the device memory (compressed). However,
>   during reclaim, clean page cache pages are quickly freed, so
>   this does not seem to be a big problem.
>
> - Stale data: Not all filesystems support issuing 'discard'
>   requests to underlying block devices. So, if such filesystems
>   are used over zram devices, we can accumulate a lot of stale
>   data in memory. Even for filesystems that do support discard
>   (for example, ext4), we need to see how effective it is.
>
> - Scalability: There is only one (per-device) de/compression
>   buffer, and stats updates are serialized as well. This can
>   lead to significant contention, especially when used for
>   generic (non-swap) purposes.

Later, we could enhance this with per-cpu counters.
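For the stats side, percpu_counter would be one cheap way to do that.
A rough, untested sketch follows; the conversion is my illustration
and not part of this patch, with num_reads used as just one example
field:

	#include <linux/percpu_counter.h>

	/* in struct ramzswap_stats: a percpu counter instead of a
	 * u64 updated under a shared lock */
	struct percpu_counter num_reads;

	/* device init */
	percpu_counter_init(&rzs->stats.num_reads, 0);

	/* hot path: cheap per-cpu increment, no lock bouncing */
	percpu_counter_inc(&rzs->stats.num_reads);

	/* ioctl/stats path: fold the per-cpu deltas into one value */
	s->num_reads = percpu_counter_sum(&rzs->stats.num_reads);

	/* device teardown */
	percpu_counter_destroy(&rzs->stats.num_reads);

The single compression buffer would need per-cpu workspaces instead,
but that is a bigger change.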
> Signed-off-by: Nitin Gupta <ngupta@xxxxxxxxxx>
> ---
>  drivers/staging/ramzswap/ramzswap_drv.c |  325 ++++++++++++++-----------
>  1 files changed, 148 insertions(+), 177 deletions(-)
>
> diff --git a/drivers/staging/ramzswap/ramzswap_drv.c b/drivers/staging/ramzswap/ramzswap_drv.c
> index d14bf91..9d20d23 100644
> --- a/drivers/staging/ramzswap/ramzswap_drv.c
> +++ b/drivers/staging/ramzswap/ramzswap_drv.c
> @@ -101,20 +101,6 @@ static void ramzswap_set_disksize(struct ramzswap *rzs, size_t totalram_bytes)
>  	rzs->disksize &= PAGE_MASK;
>  }
>
> -/*
> - * Swap header (1st page of swap device) contains information
> - * about a swap file/partition. Prepare such a header for the
> - * given ramzswap device so that swapon can identify it as a
> - * swap partition.
> - */
> -static void setup_swap_header(struct ramzswap *rzs, union swap_header *s)
> -{
> -	s->info.version = 1;
> -	s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
> -	s->info.nr_badpages = 0;
> -	memcpy(s->magic.magic, "SWAPSPACE2", 10);
> -}
> -
>  static void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
>  		struct ramzswap_ioctl_stats *s)
>  {
> @@ -202,31 +188,22 @@ out:
>  	rzs->table[index].offset = 0;
>  }
>
> -static int handle_zero_page(struct bio *bio)
> +static void handle_zero_page(struct page *page, u32 index)

It doesn't use index, so the parameter can be dropped.

>  {
>  	void *user_mem;
> -	struct page *page = bio->bi_io_vec[0].bv_page;
>
>  	user_mem = kmap_atomic(page, KM_USER0);
>  	memset(user_mem, 0, PAGE_SIZE);
>  	kunmap_atomic(user_mem, KM_USER0);
>
>  	flush_dcache_page(page);
> -
> -	set_bit(BIO_UPTODATE, &bio->bi_flags);
> -	bio_endio(bio, 0);
> -	return 0;
>  }
>
> -static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
> +static void handle_uncompressed_page(struct ramzswap *rzs,
> +				struct page *page, u32 index)
>  {
> -	u32 index;
> -	struct page *page;
>  	unsigned char *user_mem, *cmem;
>
> -	page = bio->bi_io_vec[0].bv_page;
> -	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
> -
>  	user_mem = kmap_atomic(page, KM_USER0);
>  	cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
>  		rzs->table[index].offset;
> @@ -236,79 +213,72 @@ static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
>  	kunmap_atomic(cmem, KM_USER1);
>
>  	flush_dcache_page(page);
> -
> -	set_bit(BIO_UPTODATE, &bio->bi_flags);
> -	bio_endio(bio, 0);
> -	return 0;
> -}
> -
> -/*
> - * Called when request page is not present in ramzswap.
> - * This is an attempt to read before any previous write
> - * to this location - this happens due to readahead when
> - * swap device is read from user-space (e.g. during swapon)
> - */
> -static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
> -{
> -	pr_debug("Read before write on swap device: "
> -		"sector=%lu, size=%u, offset=%u\n",
> -		(ulong)(bio->bi_sector), bio->bi_size,
> -		bio->bi_io_vec[0].bv_offset);
> -
> -	/* Do nothing. Just return success */
> -	set_bit(BIO_UPTODATE, &bio->bi_flags);
> -	bio_endio(bio, 0);
> -	return 0;
>  }
>
>  static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
>  {
> -	int ret;
> +
> +	int i;
>  	u32 index;
> -	size_t clen;
> -	struct page *page;
> -	struct zobj_header *zheader;
> -	unsigned char *user_mem, *cmem;
> +	struct bio_vec *bvec;
>
>  	rzs_stat64_inc(rzs, &rzs->stats.num_reads);
>
> -	page = bio->bi_io_vec[0].bv_page;
>  	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
> +	bio_for_each_segment(bvec, bio, i) {
> +		int ret;
> +		size_t clen;
> +		struct page *page;
> +		struct zobj_header *zheader;
> +		unsigned char *user_mem, *cmem;
>
> -	if (rzs_test_flag(rzs, index, RZS_ZERO))
> -		return handle_zero_page(bio);
> +		page = bvec->bv_page;
>
> -	/* Requested page is not present in compressed area */
> -	if (!rzs->table[index].page)
> -		return handle_ramzswap_fault(rzs, bio);
> +		if (rzs_test_flag(rzs, index, RZS_ZERO)) {
> +			handle_zero_page(page, index);
> +			continue;
> +		}
>
> -	/* Page is stored uncompressed since it's incompressible */
> -	if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED)))
> -		return handle_uncompressed_page(rzs, bio);
> +		/* Requested page is not present in compressed area */
> +		if (unlikely(!rzs->table[index].page)) {
> +			pr_debug("Read before write on swap device: "

"swap device" here: it's not ramzswap any more, it's zram. :)

> +				"sector=%lu, size=%u",
> +				(ulong)(bio->bi_sector), bio->bi_size);
> +			/* Do nothing */
> +			continue;
> +		}
>
> -	user_mem = kmap_atomic(page, KM_USER0);
> -	clen = PAGE_SIZE;
> +		/* Page is stored uncompressed since it's incompressible */
> +		if (unlikely(rzs_test_flag(rzs, index, RZS_UNCOMPRESSED))) {
> +			handle_uncompressed_page(rzs, page, index);
> +			continue;
> +		}
>
> -	cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
> -		rzs->table[index].offset;
> +		user_mem = kmap_atomic(page, KM_USER0);
> +		clen = PAGE_SIZE;
>
> -	ret = lzo1x_decompress_safe(
> -		cmem + sizeof(*zheader),
> -		xv_get_object_size(cmem) - sizeof(*zheader),
> -		user_mem, &clen);
> +		cmem = kmap_atomic(rzs->table[index].page, KM_USER1) +
> +			rzs->table[index].offset;
>
> -	kunmap_atomic(user_mem, KM_USER0);
> -	kunmap_atomic(cmem, KM_USER1);
> +		ret = lzo1x_decompress_safe(
> +			cmem + sizeof(*zheader),
> +			xv_get_object_size(cmem) - sizeof(*zheader),
> +			user_mem, &clen);
>
> -	/* should NEVER happen */
> -	if (unlikely(ret != LZO_E_OK)) {
> -		pr_err("Decompression failed! err=%d, page=%u\n",
> -			ret, index);
> -		rzs_stat64_inc(rzs, &rzs->stats.failed_reads);
> -		goto out;
> -	}
> +		kunmap_atomic(user_mem, KM_USER0);
> +		kunmap_atomic(cmem, KM_USER1);
>
> -	flush_dcache_page(page);
> +		/* should NEVER happen */
> +		if (unlikely(ret != LZO_E_OK)) {
> +			pr_err("Decompression failed! err=%d, page=%u\n",
> +				ret, index);
> +			rzs_stat64_inc(rzs, &rzs->stats.failed_reads);
> +			goto out;
> +		}
> +
> +		flush_dcache_page(page);
> +		index++;
> +	}
>
>  	set_bit(BIO_UPTODATE, &bio->bi_flags);
>  	bio_endio(bio, 0);
> @@ -321,108 +291,120 @@ out:
>
>  static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
>  {
> -	int ret;
> -	u32 offset, index;
> -	size_t clen;
> -	struct zobj_header *zheader;
> -	struct page *page, *page_store;
> -	unsigned char *user_mem, *cmem, *src;
> +	int i;
> +	u32 index;
> +	struct bio_vec *bvec;
>
>  	rzs_stat64_inc(rzs, &rzs->stats.num_writes);
>
> -	page = bio->bi_io_vec[0].bv_page;
>  	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
>
> -	src = rzs->compress_buffer;
> +	bio_for_each_segment(bvec, bio, i) {
> +		int ret;
> +		u32 offset;
> +		size_t clen;
> +		struct zobj_header *zheader;
> +		struct page *page, *page_store;
> +		unsigned char *user_mem, *cmem, *src;
>
> -	mutex_lock(&rzs->lock);
> +		page = bvec->bv_page;
> +		src = rzs->compress_buffer;
>
> -	user_mem = kmap_atomic(page, KM_USER0);
> -	if (page_zero_filled(user_mem)) {
> -		kunmap_atomic(user_mem, KM_USER0);
> -		mutex_unlock(&rzs->lock);
> -		rzs_stat_inc(&rzs->stats.pages_zero);
> -		rzs_set_flag(rzs, index, RZS_ZERO);
> +		/*
> +		 * System overwrites unused sectors. Free memory associated
> +		 * with this sector now.
> +		 */
> +		if (rzs->table[index].page ||
> +				rzs_test_flag(rzs, index, RZS_ZERO))
> +			ramzswap_free_page(rzs, index);
>
> -		set_bit(BIO_UPTODATE, &bio->bi_flags);
> -		bio_endio(bio, 0);
> -		return 0;
> -	}
> +		mutex_lock(&rzs->lock);
>
> -	ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
> -		rzs->compress_workmem);
> +		user_mem = kmap_atomic(page, KM_USER0);
> +		if (page_zero_filled(user_mem)) {
> +			kunmap_atomic(user_mem, KM_USER0);
> +			mutex_unlock(&rzs->lock);
> +			rzs_stat_inc(&rzs->stats.pages_zero);
> +			rzs_set_flag(rzs, index, RZS_ZERO);
> +			continue;
> +		}
>
> -	kunmap_atomic(user_mem, KM_USER0);
> +		ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
> +			rzs->compress_workmem);
>
> -	if (unlikely(ret != LZO_E_OK)) {
> -		mutex_unlock(&rzs->lock);
> -		pr_err("Compression failed! err=%d\n", ret);
> -		rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
> -		goto out;
> -	}
> +		kunmap_atomic(user_mem, KM_USER0);
>
> -	/*
> -	 * Page is incompressible. Store it as-is (uncompressed)
> -	 * since we do not want to return too many swap write
> -	 * errors which has side effect of hanging the system.
> -	 */
> -	if (unlikely(clen > max_zpage_size)) {
> -		clen = PAGE_SIZE;
> -		page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
> -		if (unlikely(!page_store)) {
> +		if (unlikely(ret != LZO_E_OK)) {
>  			mutex_unlock(&rzs->lock);
> -			pr_info("Error allocating memory for incompressible "
> -				"page: %u\n", index);
> +			pr_err("Compression failed! err=%d\n", ret);
>  			rzs_stat64_inc(rzs, &rzs->stats.failed_writes);
>  			goto out;
>  		}
>
> -		offset = 0;
> -		rzs_set_flag(rzs, index, RZS_UNCOMPRESSED);
> -		rzs_stat_inc(&rzs->stats.pages_expand);
> -		rzs->table[index].page = page_store;
> -		src = kmap_atomic(page, KM_USER0);
> -		goto memstore;
> -	}
> +		/*
> +		 * Page is incompressible. Store it as-is (uncompressed)
> +		 * since we do not want to return too many swap write
> +		 * errors which has side effect of hanging the system.
> +		 */
> +		if (unlikely(clen > max_zpage_size)) {
> +			clen = PAGE_SIZE;
> +			page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
> +			if (unlikely(!page_store)) {
> +				mutex_unlock(&rzs->lock);

What's the purpose of rzs->lock? Please document it.
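Even a short comment above the field would help. The sketch below is
only my guess at what it protects, so please correct it as needed:

	/*
	 * Serializes use of the single per-device compression
	 * workspace (compress_buffer/compress_workmem) and the
	 * store update for the page being written.
	 */
	struct mutex lock;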
err=%d\n", ret); > - rzs_stat64_inc(rzs, &rzs->stats.failed_writes); > - goto out; > - } > + kunmap_atomic(user_mem, KM_USER0); > > - /* > - * Page is incompressible. Store it as-is (uncompressed) > - * since we do not want to return too many swap write > - * errors which has side effect of hanging the system. > - */ > - if (unlikely(clen > max_zpage_size)) { > - clen = PAGE_SIZE; > - page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM); > - if (unlikely(!page_store)) { > + if (unlikely(ret != LZO_E_OK)) { > mutex_unlock(&rzs->lock); > - pr_info("Error allocating memory for incompressible " > - "page: %u\n", index); > + pr_err("Compression failed! err=%d\n", ret); > rzs_stat64_inc(rzs, &rzs->stats.failed_writes); > goto out; > } > > - offset = 0; > - rzs_set_flag(rzs, index, RZS_UNCOMPRESSED); > - rzs_stat_inc(&rzs->stats.pages_expand); > - rzs->table[index].page = page_store; > - src = kmap_atomic(page, KM_USER0); > - goto memstore; > - } > + /* > + * Page is incompressible. Store it as-is (uncompressed) > + * since we do not want to return too many swap write > + * errors which has side effect of hanging the system. > + */ > + if (unlikely(clen > max_zpage_size)) { > + clen = PAGE_SIZE; > + page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM); > + if (unlikely(!page_store)) { > + mutex_unlock(&rzs->lock); What's purpose of rzs->lock? Please, Document it. And please, take care of "swap" mentioned in comments. I think code doesn't have big problem. But my feel is we can clean up some portion of codes. But it's not a big deal. It doesn't have any problem to work. So I want to add zram into linux-next, too. -- Kind regards, Minchan Kim _______________________________________________ devel mailing list devel@xxxxxxxxxxxxxxxxxxxxxx http://driverdev.linuxdriverproject.org/mailman/listinfo/devel