Hi all,

to throw some more confusion into an already heated debate, here is the
'dm-ssdcache' module I recently wrote (and held a talk about at LinuxCon
Europe 2011 :-). Upon request I'll be generating a proper patch.

Not saying it's perfect, and it surely doesn't have all the nifty features
bcache has, but it should serve as a 'minimal' implementation. And it will
probably start yet another discussion, if nothing else.

So, comments etc. are welcome.

Cheers,

Hannes
--
Dr. Hannes Reinecke                   zSeries & Storage
hare@xxxxxxx                          +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
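For reference, the table format is

    <target_dev> <cache_dev> [blocksize <sectors>] [assoc <n>]
        [writethrough|writeback|readcache] [options <count> <opt> ...]

as parsed by ssdcache_ctr() below. A hypothetical table line -- untested,
device names and target size are examples only -- would be

    0 976773168 ssdcache /dev/sdb /dev/sdc blocksize 256 assoc 4 writethrough options 1 lru

which you would feed to 'dmsetup create' as usual.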
/*
 * dm-ssdcache.c
 *
 * Copyright (c) 2011 Hannes Reinecke, SUSE Linux Products GmbH
 *
 * This file is released under the GPL.
 */
#include "dm.h"
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>

#define DM_MSG_PREFIX "ssdcache: "

// #define SSD_DEBUG
#define SSD_LOG
#define SSDCACHE_USE_RADIX_TREE

#ifdef SSD_LOG
#define DPRINTK( s, arg... ) printk(DM_MSG_PREFIX s "\n", ##arg)
#define WPRINTK( w, s, arg... ) printk(DM_MSG_PREFIX "%lu: %s (cte %lx:%02lx): "\
                                       s "\n", (w)->nr, __FUNCTION__, \
                                       (w)->cmd->hash, \
                                       (w)->cte_idx, ##arg)
#else
#define DPRINTK( s, arg... )
#define WPRINTK( w, s, arg... )
#endif

#define SSDCACHE_COPY_PAGES 1024
#define MIN_SIO_ITEMS 1024
#define MIN_CTE_NUM 512
#define MIN_CMD_NUM 64
#define DEFAULT_CTE_NUM 4096
#define DEFAULT_BLOCKSIZE 256
#define DEFAULT_ALIASING 16
#define DEFAULT_ASSOCIATIVITY 4

/* Caching modes */
enum ssdcache_mode_t {
    CACHE_MODE_WRITETHROUGH,
    CACHE_MODE_WRITEBACK,
    CACHE_MODE_READCACHE,
};

/* Caching strategies */
enum ssdcache_strategy_t {
    CACHE_LRU,
    CACHE_LFU,
};

struct ssdcache_md;
struct ssdcache_io;

struct ssdcache_te {
    unsigned long index;        /* Offset within table entry block */
    unsigned long atime;        /* Timestamp of the block's last access */
    unsigned long count;        /* Number of accesses */
    DECLARE_BITMAP(clean, DEFAULT_BLOCKSIZE);
    DECLARE_BITMAP(target_busy, DEFAULT_BLOCKSIZE);
    DECLARE_BITMAP(cache_busy, DEFAULT_BLOCKSIZE);
    sector_t sector;            /* Sector number on target device */
    struct ssdcache_md *md;     /* Backlink to metadirectory */
    struct rcu_head rcu;
};

struct ssdcache_md {
    spinlock_t lock;            /* Lock to protect operations on the bio list */
    unsigned long hash;         /* Hash number */
    unsigned int num_cte;       /* Number of table entries */
    unsigned long atime;
    struct ssdcache_ctx *sc;
    struct ssdcache_te *te[DEFAULT_ALIASING];   /* RCU Table entries */
};

struct ssdcache_options {
    unsigned int assoc;
    enum ssdcache_mode_t mode;
    enum ssdcache_strategy_t strategy;
    unsigned async_lookup:1;
    unsigned disable_writeback:1;
    unsigned queue_busy:1;
    unsigned skip_write_insert:1;
    unsigned evict_on_write:1;
    unsigned cmd_preload:1;
};

struct ssdcache_ctx {
    struct dm_dev *target_dev;
    struct dm_dev *cache_dev;
    struct dm_io_client *iocp;
#ifdef SSDCACHE_USE_RADIX_TREE
    struct radix_tree_root md_tree;
#else
    struct ssdcache_md **md_table;
#endif
    spinlock_t cmd_lock;
    unsigned long hash_bits;
    unsigned long block_size;
    unsigned long block_mask;
    sector_t data_offset;
    unsigned long nr_sio;
    unsigned long sio_active;
    unsigned long cte_active;
    struct ssdcache_options options;
    unsigned long read_clean;
    unsigned long read_busy;
    unsigned long read_invalid;
    unsigned long read_miss;
    unsigned long write_clean;
    unsigned long write_busy;
    unsigned long write_cancel;
    unsigned long write_invalid;
    unsigned long write_miss;
    unsigned long write_done;
    unsigned long write_skip;
    unsigned long lookup_failed;
    unsigned long lookup_busy;
    unsigned long cache_bypassed;
    unsigned long cache_overruns;
    unsigned long cache_evictions;
    unsigned long writeback_cancelled;
    unsigned long bio_cancelled;
};

struct ssdcache_io {
    struct list_head list;
    spinlock_t lock;
    struct kref kref;
    unsigned long nr;
    struct ssdcache_ctx *sc;
    struct ssdcache_md *cmd;
    long cte_idx;
    struct bio *bio;
    struct bio *writeback_bio;
    unsigned long bio_sector;
    DECLARE_BITMAP(bio_mask, DEFAULT_BLOCKSIZE);
    int error;
};

static enum ssdcache_mode_t default_cache_mode =
    CACHE_MODE_WRITETHROUGH;
static enum ssdcache_strategy_t default_cache_strategy = CACHE_LFU;

#define CACHE_IS_WRITETHROUGH(sc) \
    ((sc)->options.mode == CACHE_MODE_WRITETHROUGH)
#define CACHE_IS_WRITEBACK(sc) \
    ((sc)->options.mode == CACHE_MODE_WRITEBACK)
#define CACHE_IS_READCACHE(sc) \
    ((sc)->options.mode == CACHE_MODE_READCACHE)

#define CACHE_USE_LRU(sc) ((sc)->options.strategy == CACHE_LRU)

static DEFINE_SPINLOCK(_work_lock);
static struct workqueue_struct *_ssdcached_wq;
static struct work_struct _ssdcached_work;
static LIST_HEAD(_cte_work);
static LIST_HEAD(_io_work);

static struct kmem_cache *_sio_cache;
static struct kmem_cache *_cmd_cache;
static struct kmem_cache *_cte_cache;
static mempool_t *_sio_pool;
static mempool_t *_cmd_pool;
static mempool_t *_cte_pool;

/* Cache metadirectory states */
enum cmd_state {
    CMD_STATE_UNMAPPED,
    CMD_STATE_MAPPED,
    CMD_STATE_RESERVED,
};

enum cte_match_t {
    CTE_READ_CLEAN,
    CTE_READ_BUSY,
    CTE_READ_INVALID,
    CTE_READ_MISS,
    CTE_WRITE_CLEAN,
    CTE_WRITE_BUSY,
    CTE_WRITE_CANCEL,
    CTE_WRITE_INVALID,
    CTE_WRITE_MISS,
    CTE_WRITE_DONE,
    CTE_WRITE_SKIP,
    CTE_LOOKUP_FAILED,
};

/*
 * Slab pools
 */
static int pool_init(void)
{
    _sio_cache = kmem_cache_create("ssdcache-sio",
                                   sizeof(struct ssdcache_io),
                                   __alignof__(struct ssdcache_io),
                                   0, NULL);
    if (!_sio_cache)
        return -ENOMEM;

    _cmd_cache = kmem_cache_create("ssdcache-cmd",
                                   sizeof(struct ssdcache_md),
                                   __alignof__(struct ssdcache_md),
                                   0, NULL);
    if (!_cmd_cache) {
        kmem_cache_destroy(_sio_cache);
        return -ENOMEM;
    }

    _cte_cache = kmem_cache_create("ssdcache-cte",
                                   sizeof(struct ssdcache_te),
                                   __alignof__(struct ssdcache_te),
                                   0, NULL);
    if (!_cte_cache) {
        kmem_cache_destroy(_cmd_cache);
        kmem_cache_destroy(_sio_cache);
        return -ENOMEM;
    }

    _sio_pool = mempool_create(MIN_SIO_ITEMS, mempool_alloc_slab,
                               mempool_free_slab, _sio_cache);
    if (!_sio_pool) {
        kmem_cache_destroy(_cte_cache);
        kmem_cache_destroy(_cmd_cache);
        kmem_cache_destroy(_sio_cache);
        return -ENOMEM;
    }

    _cmd_pool = mempool_create(MIN_CMD_NUM, mempool_alloc_slab,
                               mempool_free_slab, _cmd_cache);
    if (!_cmd_pool) {
        mempool_destroy(_sio_pool);
        kmem_cache_destroy(_cte_cache);
        kmem_cache_destroy(_cmd_cache);
        kmem_cache_destroy(_sio_cache);
        /* This return was missing; we'd fall through to _cte_pool */
        return -ENOMEM;
    }

    _cte_pool = mempool_create(MIN_CTE_NUM, mempool_alloc_slab,
                               mempool_free_slab, _cte_cache);
    if (!_cte_pool) {
        mempool_destroy(_cmd_pool);
        mempool_destroy(_sio_pool);
        kmem_cache_destroy(_cte_cache);
        kmem_cache_destroy(_cmd_cache);
        kmem_cache_destroy(_sio_cache);
        /* Likewise: report the failure instead of returning 0 */
        return -ENOMEM;
    }

    return 0;
}

static void pool_exit(void)
{
    mempool_destroy(_cte_pool);
    mempool_destroy(_cmd_pool);
    mempool_destroy(_sio_pool);
    kmem_cache_destroy(_cte_cache);
    kmem_cache_destroy(_cmd_cache);
    kmem_cache_destroy(_sio_cache);
}

/*
 * cache metadirectory handling
 */
static inline struct ssdcache_md *cmd_lookup(struct ssdcache_ctx *sc,
                                             unsigned long hash_number)
{
    struct ssdcache_md *cmd;

    rcu_read_lock();
#ifdef SSDCACHE_USE_RADIX_TREE
    cmd = radix_tree_lookup(&sc->md_tree, hash_number);
#else
    cmd = rcu_dereference(sc->md_table[hash_number]);
#endif
    rcu_read_unlock();
    return cmd;
}

static inline struct ssdcache_md *cmd_insert(struct ssdcache_ctx *sc,
                                             unsigned long hash_number)
{
    struct ssdcache_md *cmd;

    cmd = mempool_alloc(_cmd_pool, GFP_NOIO | __GFP_ZERO);
    if (!cmd)
        return NULL;

    cmd->hash = hash_number;
    cmd->num_cte = DEFAULT_ALIASING;
    spin_lock_init(&cmd->lock);
    cmd->atime = jiffies;
    cmd->sc = sc;
#ifdef SSDCACHE_USE_RADIX_TREE
    if (radix_tree_preload(GFP_NOIO)) {
        mempool_free(cmd, _cmd_pool);
        return NULL;
    }
#endif
    spin_lock(&sc->cmd_lock);
#ifdef SSDCACHE_USE_RADIX_TREE
    if (radix_tree_insert(&sc->md_tree, hash_number, cmd)) {
        mempool_free(cmd, _cmd_pool);
        cmd = radix_tree_lookup(&sc->md_tree, hash_number);
        BUG_ON(!cmd);
        BUG_ON(cmd->hash != hash_number);
    }
#else
    rcu_assign_pointer(sc->md_table[hash_number], cmd);
#endif
    spin_unlock(&sc->cmd_lock);
#ifdef SSDCACHE_USE_RADIX_TREE
    radix_tree_preload_end();
#endif
    return cmd;
}

static void cmd_remove(struct ssdcache_md *cmd)
{
    struct ssdcache_te *cte;
    int j;

    if (!cmd)
        return;

    for (j = 0; j < cmd->num_cte; j++) {
        spin_lock_irq(&cmd->lock);
        cte = cmd->te[j];
        if (cte)
            rcu_assign_pointer(cmd->te[j], NULL);
        spin_unlock_irq(&cmd->lock);
        if (cte) {
            synchronize_rcu();
            mempool_free(cte, _cte_pool);
        }
    }
    mempool_free(cmd, _cmd_pool);
}

#define cte_bio_align(s,b) ((b)->bi_sector & ~(s)->block_mask)
#define cte_bio_offset(s,b) ((b)->bi_sector & (s)->block_mask)

void sio_bio_mask(struct ssdcache_io *sio, struct bio *bio)
{
    unsigned long offset = cte_bio_offset(sio->sc, bio);
    int i;

    for (i = 0; i < bio_sectors(bio); i++) {
        set_bit(offset + i, sio->bio_mask);
    }
}

struct ssdcache_te *cte_new(struct ssdcache_ctx *sc, struct ssdcache_md *cmd,
                            unsigned int index)
{
    struct ssdcache_te *newcte;

    newcte = mempool_alloc(_cte_pool, GFP_NOWAIT | __GFP_ZERO);
    if (!newcte)
        return NULL;

    newcte->index = index;
    newcte->atime = jiffies;
    newcte->count = 1;
    newcte->md = cmd;
    sc->cte_active++;
    return newcte;
}

static void cte_reset(struct rcu_head *rp)
{
    struct ssdcache_te *cte = container_of(rp, struct ssdcache_te, rcu);

    cte->md->sc->cte_active--;
    mempool_free(cte, _cte_pool);
}

static bool cte_is_clean(struct ssdcache_te *cte, unsigned long *mask)
{
    DECLARE_BITMAP(tmpmask, DEFAULT_BLOCKSIZE);

    bitmap_and(tmpmask, cte->clean, mask, DEFAULT_BLOCKSIZE);
    return bitmap_equal(tmpmask, mask, DEFAULT_BLOCKSIZE);
}

static bool cte_cache_is_busy(struct ssdcache_te *cte, unsigned long *mask)
{
    DECLARE_BITMAP(tmpmask, DEFAULT_BLOCKSIZE);

    bitmap_and(tmpmask, cte->cache_busy, mask, DEFAULT_BLOCKSIZE);
    return !bitmap_empty(tmpmask, DEFAULT_BLOCKSIZE);
}

static bool cte_target_is_busy(struct ssdcache_te *cte, unsigned long *mask)
{
    DECLARE_BITMAP(tmpmask, DEFAULT_BLOCKSIZE);

    bitmap_and(tmpmask, cte->target_busy, mask, DEFAULT_BLOCKSIZE);
    return !bitmap_empty(tmpmask, DEFAULT_BLOCKSIZE);
}

#define cte_set_bitmap(c,s,m) bitmap_or((c)->m, (c)->m, \
                                        (s)->bio_mask, DEFAULT_BLOCKSIZE)
#define cte_unset_bitmap(c,s,m) bitmap_andnot((c)->m, (c)->m, \
                                              (s)->bio_mask, DEFAULT_BLOCKSIZE)

static bool sio_cache_is_busy(struct ssdcache_io *sio)
{
    struct ssdcache_te *cte;
    bool match = false;

    if (!sio->cmd)
        return false;
    BUG_ON(sio->cte_idx == -1);
    rcu_read_lock();
    cte = rcu_dereference(sio->cmd->te[sio->cte_idx]);
    if (cte)
        match = cte_cache_is_busy(cte, sio->bio_mask);
    rcu_read_unlock();
    return match;
}

static bool sio_target_is_busy(struct ssdcache_io *sio)
{
    struct ssdcache_te *cte;
    bool match = false;

    if (!sio->cmd || sio->cte_idx == -1)
        return false;
    rcu_read_lock();
    cte = rcu_dereference(sio->cmd->te[sio->cte_idx]);
    if (cte)
        match = cte_target_is_busy(cte, sio->bio_mask);
    rcu_read_unlock();
    return match;
}

static bool sio_match_sector(struct ssdcache_io *sio)
{
    struct ssdcache_te *cte;
    bool match = false;

    if (!sio || !sio->cmd || sio->cte_idx == -1)
        return false;
    rcu_read_lock();
    cte = rcu_dereference(sio->cmd->te[sio->cte_idx]);
    if (cte)
        match = (sio->bio_sector == cte->sector);
    rcu_read_unlock();
    return match;
}

static void sio_cleanup_cte(struct ssdcache_io *sio)
{
    struct ssdcache_te *newcte, *oldcte;
    bool is_invalid = false;

    BUG_ON(!sio);
    BUG_ON(!sio->cmd);
    BUG_ON(sio->cte_idx == -1);

    rcu_read_lock();
    oldcte = rcu_dereference(sio->cmd->te[sio->cte_idx]);
    if (oldcte &&
        bitmap_empty(oldcte->clean, DEFAULT_BLOCKSIZE) &&
        bitmap_empty(oldcte->target_busy, DEFAULT_BLOCKSIZE) &&
        bitmap_empty(oldcte->cache_busy, DEFAULT_BLOCKSIZE))
        is_invalid = true;
    rcu_read_unlock();

    if (!oldcte)
        return;

    if (!is_invalid) {
        newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
        /* Failure is okay; we'll drop the cte then */
    } else {
#ifdef SSD_DEBUG
        WPRINTK(sio, "drop invalid cte");
#endif
        newcte = NULL;
    }
    spin_lock_irq(&sio->cmd->lock);
    if (newcte) {
        oldcte = sio->cmd->te[sio->cte_idx];
        *newcte = *oldcte;
        newcte->atime = jiffies;
        newcte->count++;
    }
    rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
    spin_unlock_irq(&sio->cmd->lock);
    if (oldcte)
        call_rcu(&oldcte->rcu, cte_reset);
}

static void sio_finish_cache_write(struct ssdcache_io *sio)
{
    struct ssdcache_te *newcte, *oldcte;

    BUG_ON(!sio);
    BUG_ON(!sio->cmd);
    BUG_ON(sio->cte_idx == -1);

    /* Check if we should drop the old cte */
    newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
    spin_lock_irq(&sio->cmd->lock);
    oldcte = sio->cmd->te[sio->cte_idx];
    if (newcte) {
        if (oldcte)
            *newcte = *oldcte;
        newcte->atime = jiffies;
        newcte->count++;
        /* Reset busy bitmaps */
        cte_unset_bitmap(newcte, sio, cache_busy);
        /* Update the clean bitmap */
        if (sio->error)
            cte_unset_bitmap(newcte, sio, clean);
        else
            cte_set_bitmap(newcte, sio, clean);
    }
    rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
    spin_unlock_irq(&sio->cmd->lock);
    if (oldcte)
        call_rcu(&oldcte->rcu, cte_reset);
}

static void sio_finish_target_write(struct ssdcache_io *sio)
{
    struct ssdcache_te *newcte, *oldcte;

    BUG_ON(!sio);
    BUG_ON(!sio->cmd);
    BUG_ON(sio->cte_idx == -1);

    /* Check if we should drop the old cte */
    newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
    spin_lock_irq(&sio->cmd->lock);
    oldcte = sio->cmd->te[sio->cte_idx];
    if (newcte) {
        if (oldcte)
            *newcte = *oldcte;
        newcte->atime = jiffies;
        newcte->count++;
        /* Reset busy bitmaps */
        if (CACHE_IS_READCACHE(sio->sc)) {
            cte_unset_bitmap(newcte, sio, cache_busy);
        } else {
            cte_unset_bitmap(newcte, sio, target_busy);
            /* Upon error reset the clean bitmap */
            if (sio->error)
                cte_unset_bitmap(newcte, sio, clean);
        }
    }
    rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
    spin_unlock_irq(&sio->cmd->lock);
    if (oldcte)
        call_rcu(&oldcte->rcu, cte_reset);
}

static enum cte_match_t sio_new_cache_write(struct ssdcache_io *sio, int rw)
{
    struct ssdcache_te *newcte, *oldcte;
    enum cte_match_t retval;

    newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
    if (!newcte) {
        /* Ouch */
        WPRINTK(sio, "oom, drop old cte");
        sio->sc->lookup_failed++;
        retval = CTE_LOOKUP_FAILED;
    }
    spin_lock_irq(&sio->cmd->lock);
    if (newcte) {
        if (rw == WRITE) {
            cte_set_bitmap(newcte, sio, cache_busy);
            if (!CACHE_IS_READCACHE(sio->sc))
                cte_set_bitmap(newcte, sio, target_busy);
            sio->sc->write_miss++;
            retval = CTE_WRITE_MISS;
        } else {
            sio->sc->read_miss++;
            retval = CTE_READ_MISS;
        }
        newcte->sector = sio->bio_sector;
    }
    oldcte = sio->cmd->te[sio->cte_idx];
    rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
    spin_unlock_irq(&sio->cmd->lock);
    if (!newcte)
        /*
         * Invalidate the index only after the te[] accesses above;
         * clearing it beforehand (as the code used to) would have
         * indexed te[-1].
         */
        sio->cte_idx = -1;
    if (oldcte)
        call_rcu(&oldcte->rcu, cte_reset);
    return retval;
}

static void cte_start_write(struct ssdcache_io *sio)
{
    struct ssdcache_te *newcte, *oldcte;

    /* Check if we should drop the old cte */
    newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
    if (!newcte)
        return;

    spin_lock_irq(&sio->cmd->lock);
    oldcte = sio->cmd->te[sio->cte_idx];
    BUG_ON(!oldcte);
    *newcte = *oldcte;
    newcte->count++;
    cte_unset_bitmap(newcte, sio, clean);
    cte_set_bitmap(newcte, sio, cache_busy);
    if (!CACHE_IS_READCACHE(sio->sc)) {
        if (sio->sc->options.async_lookup) {
            if (!sio->error) {
                cte_set_bitmap(newcte, sio, target_busy);
            }
        } else {
            cte_set_bitmap(newcte, sio, target_busy);
        }
    }
    newcte->atime = jiffies;
    rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
    spin_unlock_irq(&sio->cmd->lock);
    call_rcu(&oldcte->rcu, cte_reset);
}

static void cte_start_cache_write(struct ssdcache_io *sio)
{
    struct ssdcache_te *newcte, *oldcte;

    /* Check if we should drop the old cte */
    newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
    if (!newcte)
        return;

    spin_lock_irq(&sio->cmd->lock);
    oldcte = sio->cmd->te[sio->cte_idx];
    BUG_ON(!oldcte);
    *newcte = *oldcte;
    newcte->count++;
    cte_unset_bitmap(newcte, sio, clean);
    cte_set_bitmap(newcte, sio, cache_busy);
    newcte->atime = jiffies;
    rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
    spin_unlock_irq(&sio->cmd->lock);
    call_rcu(&oldcte->rcu, cte_reset);
}

static void cte_start_target_write(struct ssdcache_io *sio)
{
    struct ssdcache_te *newcte, *oldcte;

    /* Check if we should drop the old cte */
    newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
    if (!newcte)
        return;

    spin_lock_irq(&sio->cmd->lock);
    oldcte = sio->cmd->te[sio->cte_idx];
    BUG_ON(!oldcte);
    *newcte = *oldcte;
    newcte->count++;
    cte_set_bitmap(newcte, sio, target_busy);
    newcte->atime = jiffies;
    rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
    spin_unlock_irq(&sio->cmd->lock);
    call_rcu(&oldcte->rcu, cte_reset);
}

static void cte_cancel_target_write(struct ssdcache_io *sio)
{
    struct ssdcache_te *newcte, *oldcte;

    /* Check if we should drop the old cte */
    newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
    if (!newcte)
        return;

    spin_lock_irq(&sio->cmd->lock);
    oldcte = sio->cmd->te[sio->cte_idx];
    BUG_ON(!oldcte);
    *newcte = *oldcte;
    newcte->count++;
    cte_unset_bitmap(newcte, sio, clean);
    cte_unset_bitmap(newcte, sio, cache_busy);
    cte_unset_bitmap(newcte, sio, target_busy);
    newcte->atime = jiffies;
    rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
    spin_unlock_irq(&sio->cmd->lock);
    call_rcu(&oldcte->rcu, cte_reset);
}

static void cte_cancel_cache_write(struct ssdcache_io *sio)
{
    struct ssdcache_te *newcte, *oldcte;

    /* Check if we should drop the old cte */
    newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
    if (!newcte)
        return;

    spin_lock_irq(&sio->cmd->lock);
    oldcte = sio->cmd->te[sio->cte_idx];
    BUG_ON(!oldcte);
    *newcte = *oldcte;
    newcte->count++;
    cte_unset_bitmap(newcte, sio, clean);
    cte_unset_bitmap(newcte, sio, cache_busy);
    newcte->atime = jiffies;
    rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
    spin_unlock_irq(&sio->cmd->lock);
    call_rcu(&oldcte->rcu, cte_reset);
}

static void sio_cte_invalidate(struct ssdcache_io *sio)
{
    struct ssdcache_te *oldcte;

    spin_lock_irq(&sio->cmd->lock);
    oldcte = sio->cmd->te[sio->cte_idx];
    BUG_ON(!oldcte);
    rcu_assign_pointer(sio->cmd->te[sio->cte_idx], NULL);
    spin_unlock_irq(&sio->cmd->lock);
    call_rcu(&oldcte->rcu, cte_reset);
}

/*
 * Workqueue handling
 */
static struct ssdcache_io *ssdcache_create_sio(struct ssdcache_ctx *sc)
{
    struct ssdcache_io *sio;

    sio = mempool_alloc(_sio_pool, GFP_NOIO);
    if (!sio)
        return NULL;

    memset(sio, 0, sizeof(struct ssdcache_io));
    sio->sc = sc;
    sio->cte_idx = -1;
    sio->nr = ++sc->nr_sio;
    sc->sio_active++;
    kref_init(&sio->kref);
    INIT_LIST_HEAD(&sio->list);
    spin_lock_init(&sio->lock);
    return sio;
}

static void ssdcache_destroy_sio(struct kref *kref)
{
    struct ssdcache_io *sio = container_of(kref, struct ssdcache_io, kref);
    BUG_ON(!list_empty(&sio->list));
    if (sio->bio) {
        struct bio_vec *bvec;
        int i;

        bio_for_each_segment(bvec, sio->bio, i)
            put_page(bvec->bv_page);
        bio_put(sio->bio);
        sio->bio = NULL;
    }
    if (sio->error != -ESTALE &&
        sio_match_sector(sio) &&
        !sio_cache_is_busy(sio) &&
        !sio_target_is_busy(sio))
        sio_cleanup_cte(sio);
    sio->sc->sio_active--;
    mempool_free(sio, _sio_pool);
}

static void ssdcache_get_sio(struct ssdcache_io *sio)
{
    kref_get(&sio->kref);
}

static int ssdcache_put_sio(struct ssdcache_io *sio)
{
    return kref_put(&sio->kref, ssdcache_destroy_sio);
}

static inline void push_sio(struct list_head *q, struct ssdcache_io *sio)
{
    unsigned long flags;

    spin_lock_irqsave(&_work_lock, flags);
    list_add_tail(&sio->list, q);
    spin_unlock_irqrestore(&_work_lock, flags);
}

static void ssdcache_schedule_sio(struct ssdcache_io *sio)
{
    ssdcache_get_sio(sio);
    push_sio(&_io_work, sio);
    queue_work(_ssdcached_wq, &_ssdcached_work);
}

static void map_secondary_bio(struct ssdcache_io *sio, struct bio *bio)
{
    struct bio_vec *bvec;
    int i;

    /* Kick off secondary writes */
    sio->bio = bio_clone(bio, GFP_NOIO);
    BUG_ON(!sio->bio);
    bio_for_each_segment(bvec, sio->bio, i)
        get_page(bvec->bv_page);
}

static void map_writeback_bio(struct ssdcache_io *sio, struct bio *bio)
{
    struct bio_vec *bvec;
    int i;

    BUG_ON(sio->writeback_bio);
    sio->writeback_bio = bio_clone(bio, GFP_NOIO);
    if (!sio->writeback_bio) {
        WPRINTK(sio, "bio_clone failed");
        return;
    }
    sio->writeback_bio->bi_rw |= WRITE;
    bio_for_each_segment(bvec, sio->writeback_bio, i)
        get_page(bvec->bv_page);
}

static void unmap_writeback_bio(struct ssdcache_io *sio)
{
    int i;
    struct bio_vec *bvec;

    if (sio->writeback_bio) {
        /* Release bio */
        bio_for_each_segment(bvec, sio->writeback_bio, i)
            put_page(bvec->bv_page);
        bio_put(sio->writeback_bio);
        sio->writeback_bio = NULL;
    }
}

static void cache_io_callback(unsigned long error, void *context)
{
    struct ssdcache_io *sio = context;

    if (!sio_match_sector(sio)) {
        WPRINTK(sio, "cte overrun, not updating state");
        sio->sc->cache_overruns++;
    } else if (!sio_cache_is_busy(sio)) {
        if (!CACHE_IS_READCACHE(sio->sc))
            WPRINTK(sio, "cte not busy, not updating state");
    } else {
        if (error) {
            WPRINTK(sio, "finished with %lu", error);
            sio->error = -EIO;
        }
        if (sio->error == -EUCLEAN) {
#ifdef SSD_DEBUG
            WPRINTK(sio, "reset EUCLEAN");
#endif
            sio->error = 0;
        }
        sio_finish_cache_write(sio);
    }
    unmap_writeback_bio(sio);
    ssdcache_put_sio(sio);
    queue_work(_ssdcached_wq, &_ssdcached_work);
}

static void target_io_callback(unsigned long error, void *context)
{
    struct ssdcache_io *sio = context;

    if (!sio->cmd) {
#ifdef SSD_DEBUG
        DPRINTK("%lu: %s: cte lookup not finished",
                sio->nr, __FUNCTION__);
#endif
        if (!sio->error)
            sio->error = -EUCLEAN;
        sio->sc->cache_overruns++;
    } else if (!sio_match_sector(sio)) {
        WPRINTK(sio, "cte overrun, not updating state");
        sio->sc->cache_overruns++;
    } else if (!sio_target_is_busy(sio)) {
        WPRINTK(sio, "cte not busy, not updating state");
    } else {
        if (error) {
            WPRINTK(sio, "finished with %lu", error);
            sio->error = -EIO;
        }
        sio_finish_target_write(sio);
    }
    ssdcache_put_sio(sio);
    queue_work(_ssdcached_wq, &_ssdcached_work);
}

static inline sector_t to_cache_sector(struct ssdcache_io *sio,
                                       sector_t data_sector)
{
    sector_t sector_offset, cte_offset, cmd_offset, cache_sector;

    sector_offset = data_sector & sio->sc->block_mask;
    BUG_ON(!sio->cmd);
    BUG_ON(sio->cte_idx < 0);
    cte_offset = sio->cte_idx * sio->sc->block_size;
    cmd_offset = sio->cmd->hash * sio->cmd->num_cte * sio->sc->block_size;
    cache_sector = sio->sc->data_offset + cmd_offset + cte_offset +
                   sector_offset;
    /* i_size_read() returns bytes, so compare in bytes, not sectors */
    if (to_bytes(cache_sector) >
        i_size_read(sio->sc->cache_dev->bdev->bd_inode)) {
        WPRINTK(sio, "access beyond end of device");
        BUG();
    }
    return cache_sector;
}

static void write_to_cache(struct ssdcache_io *sio, struct bio *bio)
{
    struct dm_io_region cache;
    struct dm_io_request iorq;

    cache.bdev = sio->sc->cache_dev->bdev;
    cache.sector = to_cache_sector(sio, bio->bi_sector);
    cache.count = bio_sectors(bio);
    iorq.bi_rw = WRITE;
    iorq.mem.type = DM_IO_BVEC;
    iorq.mem.ptr.bvec = bio_iovec(bio);
    iorq.notify.fn = cache_io_callback;
    iorq.notify.context = sio;
    iorq.client = sio->sc->iocp;
    dm_io(&iorq, 1, &cache, NULL);
}

static void write_to_target(struct ssdcache_io *sio, struct bio *bio)
{
    struct dm_io_region target;
    struct dm_io_request iorq;

    target.bdev = sio->sc->target_dev->bdev;
    target.sector = bio->bi_sector;
    target.count = bio_sectors(bio);
    iorq.bi_rw = WRITE;
    iorq.mem.type = DM_IO_BVEC;
    iorq.mem.ptr.bvec = bio_iovec(bio);
    iorq.notify.fn = target_io_callback;
    iorq.notify.context = sio;
    iorq.client = sio->sc->iocp;
    dm_io(&iorq, 1, &target, NULL);
}

static void sio_start_prefetch(struct ssdcache_io *sio, struct bio *bio)
{
    if (!sio->sc->options.disable_writeback) {
        /* Setup clone for writing to cache device */
        map_writeback_bio(sio, bio);
    }
    bio->bi_bdev = sio->sc->target_dev->bdev;
}

/*
 * sio_start_write_busy
 *
 * That one's a little tricky.
 * We hit this when a cache write is still in flight.
 * However, for writethrough it actually only matters
 * that the _target_ write is completed.
 * So we can start the target write and defer the cache
 * write until after the original cache write has completed.
 */
static void sio_start_write_busy(struct ssdcache_io *sio, struct bio *bio)
{
    if (!CACHE_IS_READCACHE(sio->sc))
        /* Setup clone for writing to cache device */
        map_writeback_bio(sio, bio);

    if (!CACHE_IS_WRITEBACK(sio->sc)) {
        bio->bi_bdev = sio->sc->target_dev->bdev;
    } else {
        bio->bi_bdev = sio->sc->cache_dev->bdev;
        bio->bi_sector = to_cache_sector(sio, bio->bi_sector);
    }
}

static void sio_start_write_miss(struct ssdcache_io *sio, struct bio *bio)
{
    if (!sio->sc->options.async_lookup &&
        !CACHE_IS_READCACHE(sio->sc)) {
        map_secondary_bio(sio, bio);
        ssdcache_schedule_sio(sio);
    }
    if (!CACHE_IS_WRITEBACK(sio->sc)) {
        bio->bi_bdev = sio->sc->target_dev->bdev;
    } else {
        bio->bi_bdev = sio->sc->cache_dev->bdev;
        bio->bi_sector = to_cache_sector(sio, bio->bi_sector);
    }
}

/*
 * sio_check_writeback
 *
 * Check if the pending writeback bio is safe for
 * submission.
 * At this point the target read has completed
 * and we try to write the original bio back to
 * the cache to increase the likelihood of a
 * cache hit.
 * However, this is an optimisation. So whenever
 * the cte has been evicted or a write is already
 * outstanding on the same cte, we can safely
 * cancel the writeback.
 */
static bool sio_check_writeback(struct ssdcache_io *sio)
{
    struct ssdcache_te *cte;

    BUG_ON(!sio);
    BUG_ON(!sio->cmd);
    BUG_ON(sio->cte_idx == -1);

    rcu_read_lock();
    cte = rcu_dereference(sio->cmd->te[sio->cte_idx]);
    rcu_read_unlock();
    if (!cte) {
#ifdef SSD_DEBUG
        WPRINTK(sio, "invalid cte");
#endif
        return false;
    }
    if (cte->sector != sio->bio_sector) {
#ifdef SSD_DEBUG
        WPRINTK(sio, "wrong sector %llx %llx",
                (unsigned long long)cte->sector,
                (unsigned long long)sio->bio_sector);
#endif
        return false;
    }
    /* Check if there is an outstanding cache write */
    if (cte_cache_is_busy(cte, sio->bio_mask)) {
#ifdef SSD_DEBUG
        WPRINTK(sio, "cache sector busy");
#endif
        return false;
    }
    return true;
}

/*
 * Hashing
 *
 * We implement double hashing to avoid hash collisions.
 */
static unsigned long ssdcache_hash_64(struct ssdcache_ctx *sc, sector_t sector)
{
    unsigned long value, hash_number, sector_shift;

    sector_shift = fls(sc->block_size) - 1;
    value = sector >> sector_shift;
    hash_number = (unsigned long)hash_64(value, sc->hash_bits);
    return hash_number;
}

static unsigned long ssdcache_hash_wrap(struct ssdcache_ctx *sc,
                                        sector_t sector)
{
    unsigned long sector_shift, value, hash_mask;

    sector_shift = fls(sc->block_size) - 1;
    value = sector >> sector_shift;
    hash_mask = (1UL << sc->hash_bits) - 1;
    return value & hash_mask;
}

static unsigned long rehash_block(struct ssdcache_ctx *sc, sector_t sector,
                                  unsigned long hash_number)
{
    unsigned long result, hash_mask;

    hash_mask = (1UL << sc->hash_bits) - 1;
    result = hash_number + ssdcache_hash_64(sc, sector);
    if ((result & hash_mask) == hash_number)
        result++;
    return result & hash_mask;
}

static enum cte_match_t cte_match(struct ssdcache_io *sio, int rw)
{
    unsigned long hash_number;
    unsigned long cte_atime, oldest_atime;
    unsigned long cte_count, oldest_count;
    int skip_cmd_instantiation = 0;
    int evict_cte_on_write = 0;
    int invalid, oldest, i, index, busy = 0, assoc = 0;
    enum cte_match_t retval = CTE_LOOKUP_FAILED;

    hash_number = ssdcache_hash_wrap(sio->sc, sio->bio_sector);
    if (rw == WRITE && sio->sc->options.skip_write_insert)
        skip_cmd_instantiation = 1;
    if (rw == WRITE && sio->sc->options.evict_on_write)
        evict_cte_on_write = 1;

retry:
    oldest_atime = jiffies;
    oldest_count = -1;
    oldest = -1;
    invalid = -1;
    index = -1;
    /* Lookup cmd */
    sio->cmd = cmd_lookup(sio->sc, hash_number);
    if (!sio->cmd) {
        if (skip_cmd_instantiation) {
            /* Skip cte instantiation on WRITE */
            sio->cte_idx = -1;
            sio->sc->write_skip++;
            return CTE_WRITE_SKIP;
        }
        if (sio->error) {
            /* Target write already completed */
            sio->cte_idx = -1;
            sio->sc->write_done++;
            return CTE_WRITE_DONE;
        }
        sio->cmd = cmd_insert(sio->sc, hash_number);
        if (!sio->cmd) {
            DPRINTK("%lu: %s: cmd insertion failure",
                    sio->nr, __FUNCTION__);
            sio->sc->lookup_failed++;
            sio->cte_idx = -1;
            retval = CTE_LOOKUP_FAILED;
            goto out;
        } else {
#ifdef SSD_DEBUG
            DPRINTK("%lu: %s (cte %lx:0): use first clean entry",
                    sio->nr, __FUNCTION__, sio->cmd->hash);
#endif
            /* Clean cmd, first entry is useable */
            invalid = 0;
            /* Skip cte lookup */
            goto found;
        }
    }
    for (i = 0; i < sio->cmd->num_cte; i++) {
        struct ssdcache_te *cte;

        rcu_read_lock();
        cte = rcu_dereference(sio->cmd->te[i]);
        rcu_read_unlock();
        if (!cte) {
            if (invalid == -1)
                invalid = i;
            continue;
        }
        if (cte->sector == sio->bio_sector) {
            sio->cte_idx = i;
            if (cte_is_clean(cte, sio->bio_mask) && rw == READ) {
                sio->sc->read_clean++;
                retval = CTE_READ_CLEAN;
                /* WRITE_CLEAN is mapped to WRITE_INVALID */
            } else if (cte_target_is_busy(cte, sio->bio_mask)) {
                /* Target busy */
                if (rw == WRITE) {
                    if (CACHE_IS_READCACHE(sio->sc)) {
                        sio_cte_invalidate(sio);
                        sio->sc->write_cancel++;
                        retval = CTE_WRITE_CANCEL;
                    } else if ((CACHE_IS_WRITEBACK(sio->sc)) &&
                               sio->sc->options.queue_busy) {
                        cte_start_cache_write(sio);
                        sio->sc->write_busy++;
                        retval = CTE_WRITE_BUSY;
                    } else {
                        cte_cancel_target_write(sio);
                        sio->sc->write_cancel++;
                        retval = CTE_WRITE_CANCEL;
                    }
                } else {
                    sio->sc->read_busy++;
                    retval = CTE_READ_BUSY;
                }
            } else if (cte_cache_is_busy(cte, sio->bio_mask)) {
                /* Cache busy */
                if (rw == WRITE) {
                    if (CACHE_IS_READCACHE(sio->sc)) {
                        sio_cte_invalidate(sio);
                        sio->sc->write_cancel++;
                        retval = CTE_WRITE_CANCEL;
                    } else if (CACHE_IS_WRITETHROUGH(sio->sc) &&
                               sio->sc->options.queue_busy) {
                        cte_start_target_write(sio);
                        sio->sc->write_busy++;
                        retval = CTE_WRITE_BUSY;
                    } else {
                        cte_cancel_cache_write(sio);
                        sio->sc->write_cancel++;
                        retval = CTE_WRITE_CANCEL;
                    }
                } else {
                    sio->sc->read_busy++;
                    retval = CTE_READ_BUSY;
                }
            } else {
                /* Invalid cte sector */
                if (rw == WRITE) {
                    cte_start_write(sio);
                    sio->sc->write_invalid++;
                    retval = CTE_WRITE_INVALID;
                } else {
                    sio->sc->read_invalid++;
                    retval = CTE_READ_INVALID;
                }
            }
            goto out;
        }
        /*
         * Do not attempt to evict entries when
         * target writes have already completed.
         */
        if (sio->error) {
            DPRINTK("%lu: %s (cte %lx:%x): error %d",
                    sio->nr, __FUNCTION__, sio->cmd->hash, i,
                    sio->error);
            continue;
        }
        /* Break out if we have found an invalid entry */
        if (invalid != -1)
            break;
        /* Skip cache eviction on WRITE */
        if (!evict_cte_on_write)
            continue;
        /* Can only eject non-busy entries */
        if (cte_target_is_busy(cte, sio->bio_mask) ||
            cte_cache_is_busy(cte, sio->bio_mask)) {
#ifdef SSD_DEBUG
            DPRINTK("%lu: %s (cte %lx:%x): skip busy cte",
                    sio->nr, __FUNCTION__, sio->cmd->hash, i);
#endif
            busy++;
            continue;
        }
        /* Can only eject CLEAN entries */
        if (!cte_is_clean(cte, sio->bio_mask)) {
#ifdef SSD_DEBUG
            DPRINTK("%lu: %s (cte %lx:%x): skip not-clean cte",
                    sio->nr, __FUNCTION__, sio->cmd->hash, i);
#endif
            busy++;
            continue;
        }
        if (CACHE_USE_LRU(sio->sc)) {
            /* Select the oldest clean entry */
            rcu_read_lock();
            cte_atime = rcu_dereference(cte)->atime;
            rcu_read_unlock();
            if (time_before_eq(cte_atime, oldest_atime)) {
                oldest_atime = cte_atime;
                oldest = i;
            }
        } else {
            /* Select the lowest access count */
            rcu_read_lock();
            cte_count = rcu_dereference(cte)->count;
            rcu_read_unlock();
            if (cte_count <= oldest_count) {
                oldest_count = cte_count;
                oldest = i;
            }
        }
    }
    if (invalid == -1 && assoc < sio->sc->options.assoc && !sio->error) {
        hash_number = rehash_block(sio->sc, sio->bio_sector,
                                   hash_number);
        assoc++;
#ifdef SSD_DEBUG
        DPRINTK("%lu: %s (cte %lx:%x): retry with assoc %d",
                sio->nr, __FUNCTION__, sio->cmd->hash, i, assoc);
#endif
        goto retry;
    }

found:
    if (invalid != -1) {
        index = invalid;
    } else if (oldest != -1) {
#ifdef SSD_DEBUG
        DPRINTK("%lu: %s (cte %lx:%x): drop oldest cte",
                sio->nr, __FUNCTION__, sio->cmd->hash, oldest);
#endif
        sio->sc->cache_evictions++;
        index = oldest;
    } else {
        index = -1;
    }
    if (index != -1) {
        sio->cte_idx = index;
        retval = sio_new_cache_write(sio, rw);
    } else if (sio->error) {
        sio->cte_idx = -1;
        sio->sc->write_done++;
        retval = CTE_WRITE_DONE;
    } else if (!evict_cte_on_write) {
        sio->cte_idx = -1;
        sio->sc->write_skip++;
        retval = CTE_WRITE_SKIP;
    } else {
        DPRINTK("%lu: %s (cte %lx:ff): %d/%d ctes busy",
                sio->nr, __FUNCTION__, sio->cmd->hash, busy, assoc);
        sio->cte_idx = -1;
        sio->sc->lookup_busy++;
    }

out:
    return retval;
}

static void sio_lookup_async(struct ssdcache_io *sio)
{
    enum cte_match_t ret = CTE_LOOKUP_FAILED;

    if (!sio->cmd || sio->cte_idx == -1)
        ret = cte_match(sio, WRITE);
    switch (ret) {
    case CTE_WRITE_INVALID:
    case CTE_WRITE_CLEAN:
    case CTE_WRITE_MISS:
        ssdcache_get_sio(sio);
        write_to_cache(sio, sio->bio);
        break;
    case CTE_WRITE_BUSY:
        WPRINTK(sio, "cte busy for write");
        sio->error = -EBUSY;
        break;
    case CTE_WRITE_DONE:
#ifdef SSD_DEBUG
        DPRINTK("%lu: %s: cte already done", sio->nr, __FUNCTION__);
#endif
        break;
    default:
        DPRINTK("%lu: %s (cte %lx:%lx): cte lookup failed %d",
                sio->nr, __FUNCTION__,
                sio->cmd ? sio->cmd->hash : 0xffff,
                sio->cte_idx == -1 ? 0xfff : sio->cte_idx, ret);
        sio->error = -ENOENT;
    }
}

static void sio_in_flight(void)
{
    struct ssdcache_io *sio;
    unsigned long flags;
    int in_flight = 0;

    spin_lock_irqsave(&_work_lock, flags);
    list_for_each_entry(sio, &_io_work, list) {
        in_flight++;
    }
    spin_unlock_irqrestore(&_work_lock, flags);
    DPRINTK("%d sios in flight", in_flight);
}

/*
 * process_sio
 *
 * Using list_splice() here would shift processing onto
 * a local list, so we couldn't figure out how many
 * sios are still pending.
 * Using an open-coded retry loop allows us to
 * traverse the list of outstanding requests
 * even from another context/function.
 */
static void process_sio(struct work_struct *ignored)
{
    unsigned long flags;
    struct ssdcache_io *sio;

retry:
    spin_lock_irqsave(&_work_lock, flags);
    if (!list_empty(&_io_work)) {
        sio = list_first_entry(&_io_work, struct ssdcache_io, list);
        list_del_init(&sio->list);
    } else {
        sio = NULL;
    }
    spin_unlock_irqrestore(&_work_lock, flags);
    if (sio) {
        if (sio->bio) {
            /* secondary write */
            if (sio->bio->bi_rw & REQ_FLUSH) {
                /* DM only sends empty flushes */
                DPRINTK("%lu: %s: flush done",
                        sio->nr, __FUNCTION__);
                bio_endio(sio->bio, 0);
                bio_put(sio->bio);
                sio->bio = NULL;
            } else if (!sio_match_sector(sio)) {
                WPRINTK(sio, "target cte overrun");
                sio->error = -ESTALE;
                sio->sc->cache_overruns++;
            } else if (CACHE_IS_WRITETHROUGH(sio->sc)) {
                if (!sio_cache_is_busy(sio)) {
                    WPRINTK(sio, "cache cte not busy");
                    sio->error = -ESTALE;
                    sio->sc->bio_cancelled++;
                } else {
                    if (sio->sc->options.async_lookup)
                        sio_lookup_async(sio);
                    else {
                        ssdcache_get_sio(sio);
                        write_to_cache(sio, sio->bio);
                    }
                }
            } else {
                if (!sio_target_is_busy(sio)) {
                    WPRINTK(sio, "target cte not busy");
                    sio->error = -ESTALE;
                    sio->sc->bio_cancelled++;
                } else {
                    ssdcache_get_sio(sio);
                    if (sio->sc->options.async_lookup)
                        cte_start_target_write(sio);
                    write_to_target(sio, sio->bio);
                }
            }
        } else if (sio->writeback_bio) {
            if (!sio_check_writeback(sio)) {
                /* Cancel writeback */
                unmap_writeback_bio(sio);
                sio->error = -ESTALE;
                sio->sc->writeback_cancelled++;
            } else {
                /* Start writing to cache device */
                ssdcache_get_sio(sio);
                cte_start_cache_write(sio);
                write_to_cache(sio, sio->writeback_bio);
            }
        } else {
            WPRINTK(sio, "invalid workqueue state");
        }
        ssdcache_put_sio(sio);
        goto retry;
    }
}

static int ssdcache_map(struct dm_target *ti, struct bio *bio,
                        union map_info *map_context)
{
    struct ssdcache_ctx *sc = ti->private;
    struct ssdcache_io *sio;

    /* We don't support DISCARD or SECURE_DISCARD (yet) */
    if (bio->bi_rw & (REQ_DISCARD | REQ_SECURE)) {
        bio->bi_bdev = sc->target_dev->bdev;
        map_context->ptr = NULL;
        return -EOPNOTSUPP;
    }
    if (bio_cur_bytes(bio) > to_bytes(sc->block_size)) {
        DPRINTK("bio size %u larger than block size",
                bio_cur_bytes(bio));
        sc->cache_bypassed++;
        bio->bi_bdev = sc->target_dev->bdev;
        map_context->ptr = NULL;
        return DM_MAPIO_REMAPPED;
    }
    if (bio_cur_bytes(bio) == 0 && !(bio->bi_rw & REQ_FLUSH)) {
        DPRINTK("zero-sized bio (bi_rw %lx)", bio->bi_rw);
        sc->cache_bypassed++;
        bio->bi_bdev = sc->target_dev->bdev;
        map_context->ptr = NULL;
        return DM_MAPIO_REMAPPED;
    }
    sio = ssdcache_create_sio(sc);
    if (!sio) {
        DPRINTK("sio creation failure");
        /* sio is NULL here; don't dereference it or drop a reference */
        sc->cache_bypassed++;
        bio->bi_bdev = sc->target_dev->bdev;
        map_context->ptr = NULL;
        return DM_MAPIO_REMAPPED;
    }
    sio->bio_sector = cte_bio_align(sc, bio);
    sio_bio_mask(sio, bio);
    map_context->ptr = sio;

    if (bio->bi_rw & REQ_FLUSH) {
        DPRINTK("%lu: %s: flush start", sio->nr, __FUNCTION__);
#ifdef SSD_DEBUG
        sio_in_flight();
#endif
        sio->bio = bio;
        bio_get(bio);
        map_context->ptr = NULL;
        ssdcache_schedule_sio(sio);
        return DM_MAPIO_SUBMITTED;
    }

    if (sc->options.async_lookup && (bio_data_dir(bio) == WRITE)) {
        map_secondary_bio(sio, bio);
        ssdcache_schedule_sio(sio);
        if (CACHE_IS_WRITETHROUGH(sc)) {
            bio->bi_bdev = sc->target_dev->bdev;
            return DM_MAPIO_REMAPPED;
        }
    }

    switch (cte_match(sio, bio_data_dir(bio))) {
    case CTE_READ_CLEAN:
        /* Cache hit, cte clean */
#ifdef SSD_DEBUG
        WPRINTK(sio, "read hit clean %llx %u",
                (unsigned long long)bio->bi_sector,
                bio_cur_bytes(bio));
#endif
        bio->bi_bdev = sio->sc->cache_dev->bdev;
        bio->bi_sector = to_cache_sector(sio, bio->bi_sector);
        map_context->ptr = NULL;
        ssdcache_put_sio(sio);
        break;
    case CTE_READ_BUSY:
        WPRINTK(sio, "read hit busy %llx %u",
                (unsigned long long)bio->bi_sector,
                bio_cur_bytes(bio));
        /* Do not start prefetching here, sector is already busy */
        bio->bi_bdev = sc->target_dev->bdev;
        map_context->ptr = NULL;
        ssdcache_put_sio(sio);
        break;
    case CTE_READ_INVALID:
#ifdef SSD_DEBUG
        WPRINTK(sio, "read invalid %llx %u",
                (unsigned long long)bio->bi_sector,
                bio_cur_bytes(bio));
#endif
        sio_start_prefetch(sio, bio);
        if (!sio->writeback_bio) {
            map_context->ptr = NULL;
            ssdcache_put_sio(sio);
        }
        break;
    case CTE_READ_MISS:
#ifdef SSD_DEBUG
        WPRINTK(sio, "read miss %llx %u",
                (unsigned long long)bio->bi_sector,
                bio_cur_bytes(bio));
#endif
        sio_start_prefetch(sio, bio);
        if (!sio->writeback_bio) {
            map_context->ptr = NULL;
            ssdcache_put_sio(sio);
        }
        break;
    case CTE_LOOKUP_FAILED:
        DPRINTK("%lu: %s: lookup failure %llx %u",
                sio->nr, __FUNCTION__,
                (unsigned long long)bio->bi_sector,
                bio_cur_bytes(bio));
        /* Fallthrough */
    case CTE_WRITE_SKIP:
        bio->bi_bdev = sc->target_dev->bdev;
        map_context->ptr = NULL;
        ssdcache_put_sio(sio);
        break;
    case CTE_WRITE_DONE:
        /* Write to target already completed */
        DPRINTK("%lu: %s: write target done", sio->nr, __FUNCTION__);
        map_context->ptr = NULL;
        bio_endio(bio, sio->error);
        ssdcache_put_sio(sio);
        return DM_MAPIO_SUBMITTED;
    case CTE_WRITE_BUSY:
#ifdef SSD_DEBUG
        WPRINTK(sio, "write hit busy %llx %u",
                (unsigned long long)bio->bi_sector,
                bio_cur_bytes(bio));
#endif
        sio_start_write_busy(sio, bio);
        if (sc->options.queue_busy && !sio->writeback_bio) {
            map_context->ptr = NULL;
            ssdcache_put_sio(sio);
        }
        break;
    case CTE_WRITE_CANCEL:
        WPRINTK(sio, "write hit cancel %llx %u",
                (unsigned long long)bio->bi_sector,
                bio_cur_bytes(bio));
        map_context->ptr = NULL;
        ssdcache_put_sio(sio);
        break;
    case CTE_WRITE_INVALID:
#ifdef SSD_DEBUG
        WPRINTK(sio, "write hit invalid %llx %u",
                (unsigned long long)bio->bi_sector,
                bio_cur_bytes(bio));
#endif
        sio_start_write_miss(sio, bio);
        break;
    case CTE_WRITE_CLEAN:
#ifdef SSD_DEBUG
        WPRINTK(sio, "write hit clean %llx %u",
                (unsigned long long)bio->bi_sector,
                bio_cur_bytes(bio));
#endif
        sio_start_write_miss(sio, bio);
        break;
    case CTE_WRITE_MISS:
#ifdef SSD_DEBUG
        WPRINTK(sio, "write miss %llx %u",
                (unsigned long long)bio->bi_sector,
                bio_cur_bytes(bio));
#endif
        sio_start_write_miss(sio, bio);
        break;
    }
    return DM_MAPIO_REMAPPED;
}

static int ssdcache_endio(struct dm_target *ti, struct bio *bio,
                          int error, union map_info *map_context)
{
    struct ssdcache_io *sio = map_context->ptr;

    if (!sio)
        return error;

    if (!sio->cmd || sio->cte_idx == -1) {
#ifdef SSD_DEBUG
        DPRINTK("%lu: %s: cte lookup not finished",
                sio->nr, __FUNCTION__);
#endif
        sio->sc->cache_overruns++;
        sio->error = error ? error : -EUCLEAN;
        goto out;
    }
    if (error) {
        WPRINTK(sio, "finished with %u", error);
        sio->error = error;
    }
    if (sio->error) {
        WPRINTK(sio, "error %d", sio->error);
        unmap_writeback_bio(sio);
    }
    if (bio_data_dir(bio) == WRITE) {
        bool to_cache = bio->bi_bdev == sio->sc->cache_dev->bdev;

        if (!sio_match_sector(sio)) {
            WPRINTK(sio, "cte overrun, not updating state");
            sio->sc->cache_overruns++;
        } else if (to_cache) {
            if (!sio_cache_is_busy(sio)) {
                WPRINTK(sio, "cache not busy, not updating");
                sio->sc->cache_overruns++;
            } else {
                sio_finish_cache_write(sio);
            }
        } else {
            if (CACHE_IS_READCACHE(sio->sc)) {
                sio_finish_target_write(sio);
            } else if (!sio_target_is_busy(sio)) {
                WPRINTK(sio, "target not busy, not updating");
                sio->sc->cache_overruns++;
            } else {
                sio_finish_target_write(sio);
            }
        }
    }
    if (sio->writeback_bio) {
        /* Kick off writeback */
        ssdcache_schedule_sio(sio);
    }

out:
    ssdcache_put_sio(sio);
    return error;
}

static int ssdcache_parse_options(struct dm_target *ti, struct dm_arg_set *as,
                                  struct ssdcache_ctx *sc)
{
    int r;
    unsigned int argc;
    const char *opt_name;
    static struct dm_arg _args[] = {
        {0, 5, "invalid number of options"},
    };

    r = dm_read_arg_group(_args, as, &argc, &ti->error);
    if (r)
        return -EINVAL;

    if (!argc)
        return 0;

    do {
        opt_name = dm_shift_arg(as);
        argc--;
        if (!strcasecmp(opt_name, "lfu")) {
            sc->options.strategy = CACHE_LFU;
            continue;
        }
        if (!strcasecmp(opt_name, "lru")) {
            sc->options.strategy = CACHE_LRU;
            continue;
        }
        if (!strcasecmp(opt_name, "async_lookup")) {
            sc->options.async_lookup = 1;
            continue;
        }
        if (!strcasecmp(opt_name, "queue_busy")) {
            sc->options.queue_busy = 1;
            continue;
        }
        if (!strcasecmp(opt_name, "disable_writeback")) {
            sc->options.disable_writeback = 1;
            continue;
        }
        if (!strcasecmp(opt_name, "skip_write_insert")) {
            sc->options.skip_write_insert = 1;
            continue;
        }
        if (!strcasecmp(opt_name, "evict_on_write")) {
            sc->options.evict_on_write = 1;
            continue;
        }
        if (!strcasecmp(opt_name, "cmd_preload")) {
            sc->options.cmd_preload = 1;
            continue;
        }
    } while (argc);

    return 0;
}

void ssdcache_format_options(struct ssdcache_ctx *sc, char *optstr)
{
    int optnum = 0;

    if (sc->options.strategy != default_cache_strategy)
        optnum++;
    if (sc->options.async_lookup)
        optnum++;
    if (sc->options.queue_busy)
        optnum++;
    if (sc->options.disable_writeback)
        optnum++;
    if (sc->options.skip_write_insert)
        optnum++;
    if (sc->options.evict_on_write)
        optnum++;
    if (sc->options.cmd_preload)
        optnum++;

    if (!optnum) {
        optstr[0] = '\0';
        return;
    }
    sprintf(optstr, " options %d ", optnum);
    if (sc->options.strategy != default_cache_strategy) {
        if (sc->options.strategy == CACHE_LFU)
            strcat(optstr, "lfu ");
        else
            strcat(optstr, "lru ");
    }
    if (sc->options.async_lookup)
        strcat(optstr, "async_lookup ");
    if (sc->options.queue_busy)
        strcat(optstr, "queue_busy ");
    if (sc->options.disable_writeback)
        strcat(optstr, "disable_writeback ");
    if (sc->options.skip_write_insert)
        strcat(optstr, "skip_write_insert ");
    if (sc->options.evict_on_write)
        strcat(optstr, "evict_on_write ");
    if (sc->options.cmd_preload)
        strcat(optstr, "cmd_preload ");
    optstr[strlen(optstr)] = '\0';
}

/*
 * Construct a ssdcache mapping: <target_dev_path> <cache_dev_path>
 */
static int ssdcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
    struct ssdcache_ctx *sc;
    struct dm_arg_set as;
    const char *devname;
    const char *argname;
    unsigned long num_cmd;
    unsigned long cdev_size;
    unsigned long long tdev_size;
    int r = 0;

    as.argc = argc;
    as.argv = argv;

    sc = kzalloc(sizeof(*sc), GFP_KERNEL);
    if (sc == NULL) {
        ti->error = "dm-ssdcache: Cannot allocate ssdcache context";
        return -ENOMEM;
    }

    devname = dm_shift_arg(&as);
    if (!devname) {
        ti->error = "dm-ssdcache: Target device is not specified";
        r = -EINVAL;
        goto bad;
    }
    if (dm_get_device(ti, devname, dm_table_get_mode(ti->table),
                      &sc->target_dev)) {
        ti->error = "dm-ssdcache: Target device lookup failed";
        r = -EINVAL;
        goto bad;
    }

    devname = dm_shift_arg(&as);
    if (!devname) {
        ti->error = "dm-ssdcache: Cache device is not specified";
        r = -EINVAL;
        goto bad;
    }
    if (dm_get_device(ti, devname, dm_table_get_mode(ti->table),
                      &sc->cache_dev)) {
        ti->error = "dm-ssdcache: Cache device lookup failed";
        dm_put_device(ti, sc->target_dev);
        r = -EINVAL;
        goto bad;
    }

    sc->block_size = DEFAULT_BLOCKSIZE;
    sc->options.strategy = default_cache_strategy;
    sc->options.mode = default_cache_mode;
    sc->options.assoc = DEFAULT_ASSOCIATIVITY;

    while ((argname = dm_shift_arg(&as)) != NULL) {
        if (!strcasecmp(argname, "blocksize")) {
            if (sscanf(dm_shift_arg(&as), "%lu",
                       &sc->block_size) != 1) {
                ti->error = "Invalid blocksize";
            }
            if (sc->block_size < 1) {
                ti->error = "blocksize too small";
                sc->block_size = DEFAULT_BLOCKSIZE;
            }
        } else if (!strcasecmp(argname, "assoc")) {
            if (sscanf(dm_shift_arg(&as), "%d",
                       &sc->options.assoc) != 1) {
                ti->error = "Invalid associativity";
            }
            /*
             * This used to test and reset block_size, which
             * looks like a copy-and-paste slip.
             */
            if (sc->options.assoc < 1) {
                ti->error = "Associativity must be at least 1";
                sc->options.assoc = DEFAULT_ASSOCIATIVITY;
            }
        } else if (!strcasecmp(argname, "writeback")) {
            sc->options.mode = CACHE_MODE_WRITEBACK;
        } else if (!strcasecmp(argname, "writethrough")) {
            sc->options.mode = CACHE_MODE_WRITETHROUGH;
        } else if (!strcasecmp(argname, "readcache")) {
            sc->options.mode = CACHE_MODE_READCACHE;
        } else if (!strcasecmp(argname, "options")) {
            if (ssdcache_parse_options(ti, &as, sc)) {
                r = -EINVAL;
                goto bad_io_client;
            }
        } else {
            ti->error = "Invalid argument";
            r = -EINVAL;
            goto bad_io_client;
        }
    }

    sc->iocp = dm_io_client_create();
    if (IS_ERR(sc->iocp)) {
        r = PTR_ERR(sc->iocp);
        ti->error = "Failed to create io client";
        goto bad_io_client;
    }

    spin_lock_init(&sc->cmd_lock);
    cdev_size = i_size_read(sc->cache_dev->bdev->bd_inode);
    tdev_size = i_size_read(sc->target_dev->bdev->bd_inode);
    num_cmd = cdev_size / to_bytes(sc->block_size) / DEFAULT_ALIASING;

    /*
     * Hash bit calculation might return a lower number
     * for the possible number of ctes, so adjust that
     * as well.
     */
    sc->hash_bits = fls(num_cmd) - 1;
    num_cmd = (1UL << sc->hash_bits);
    DPRINTK("block size %lu, hash bits %lu, num cmd %lu",
            to_bytes(sc->block_size), sc->hash_bits, num_cmd);
#ifdef SSDCACHE_USE_RADIX_TREE
    INIT_RADIX_TREE(&sc->md_tree, GFP_ATOMIC);
#else
    sc->md_table = vmalloc(num_cmd * sizeof(struct ssdcache_md *));
    memset(sc->md_table, 0, num_cmd * sizeof(struct ssdcache_md *));
#endif
    sc->data_offset = 0;
    sc->block_mask = sc->block_size - 1;
    sc->nr_sio = 0;

    if (sc->options.cmd_preload) {
        int i;

        for (i = 0; i < num_cmd; i++)
            cmd_insert(sc, i);
    }

    ti->num_flush_requests = 1;
    ti->num_discard_requests = 1;
    ti->private = sc;
    ti->split_io = sc->block_size;
    return 0;

bad_io_client:
    dm_put_device(ti, sc->target_dev);
    dm_put_device(ti, sc->cache_dev);
bad:
    kfree(sc);
    return r;
}

static void ssdcache_dtr(struct dm_target *ti)
{
    struct ssdcache_ctx *sc = (struct ssdcache_ctx *) ti->private;
#ifdef SSDCACHE_USE_RADIX_TREE
    unsigned long pos = 0, nr_cmds;
    struct ssdcache_md *cmds[MIN_CMD_NUM];
#endif
    struct ssdcache_md *cmd;
    int i;

#ifdef SSDCACHE_USE_RADIX_TREE
    do {
        spin_lock(&sc->cmd_lock);
        nr_cmds = radix_tree_gang_lookup(&sc->md_tree,
                                         (void **)cmds, pos,
                                         MIN_CMD_NUM);
        for (i = 0; i < nr_cmds; i++) {
            pos = cmds[i]->hash;
            cmd = radix_tree_delete(&sc->md_tree, pos);
            spin_unlock(&sc->cmd_lock);
            cmd_remove(cmd);
            spin_lock(&sc->cmd_lock);
        }
        spin_unlock(&sc->cmd_lock);
        pos++;
    } while (nr_cmds == MIN_CMD_NUM);
#else
    for (i = 0; i < (1UL << sc->hash_bits); i++) {
        spin_lock(&sc->cmd_lock);
        cmd = sc->md_table[i];
        rcu_assign_pointer(sc->md_table[i], NULL);
        spin_unlock(&sc->cmd_lock);
        synchronize_rcu();
        cmd_remove(cmd);
    }
    vfree(sc->md_table);
#endif
    dm_io_client_destroy(sc->iocp);
    dm_put_device(ti, sc->target_dev);
    dm_put_device(ti, sc->cache_dev);
    kfree(sc);
}

static int ssdcache_status(struct dm_target *ti, status_type_t type,
                           char *result, unsigned int maxlen)
{
    struct ssdcache_ctx *sc = (struct ssdcache_ctx *) ti->private;
#ifdef SSDCACHE_USE_RADIX_TREE
    unsigned long nr_elems, pos = 0;
    struct ssdcache_md *cmds[MIN_CMD_NUM];
#endif
    struct ssdcache_md *cmd;
    struct ssdcache_te *cte;
    char optstr[512], modestr[64];
    unsigned long nr_cmds = 0, nr_ctes = 0;
    unsigned long nr_cache_busy = 0, nr_target_busy = 0, nr_cte_full = 0;
    int i, j;

    rcu_read_lock();
#ifdef SSDCACHE_USE_RADIX_TREE
    do {
        nr_elems = radix_tree_gang_lookup(&sc->md_tree,
                                          (void **)cmds, pos,
                                          MIN_CMD_NUM);
        for (i = 0; i < nr_elems; i++) {
            cmd = cmds[i];
            pos = cmd->hash;
            nr_cmds++;
            for (j = 0; j < cmd->num_cte; j++) {
                cte = rcu_dereference(cmd->te[j]);
                if (cte) {
                    nr_ctes++;
                    if (!bitmap_empty(cte->target_busy,
                                      DEFAULT_BLOCKSIZE))
                        nr_target_busy++;
                    if (!bitmap_empty(cte->cache_busy,
                                      DEFAULT_BLOCKSIZE))
                        nr_cache_busy++;
                    if (bitmap_full(cte->clean,
                                    DEFAULT_BLOCKSIZE))
                        nr_cte_full++;
                }
            }
        }
        pos++;
    } while (nr_elems == MIN_CMD_NUM);
#else
    for (i = 0; i < (1UL << sc->hash_bits); i++) {
        cmd = rcu_dereference(sc->md_table[i]);
        if (cmd) {
            nr_cmds++;
            for (j = 0; j < cmd->num_cte; j++) {
                cte = rcu_dereference(cmd->te[j]);
                if (cte) {
                    nr_ctes++;
                    if (!bitmap_empty(cte->target_busy,
                                      DEFAULT_BLOCKSIZE))
                        nr_target_busy++;
                    if (!bitmap_empty(cte->cache_busy,
                                      DEFAULT_BLOCKSIZE))
                        nr_cache_busy++;
                    if (bitmap_full(cte->clean,
                                    DEFAULT_BLOCKSIZE))
                        nr_cte_full++;
                }
            }
        }
    }
#endif
    rcu_read_unlock();

    switch (type) {
    case STATUSTYPE_INFO:
        snprintf(result, maxlen,
                 "cmd %lu/%lu cte %lu/%lu/%lu\n"
                 "\tread clean %lu busy %lu invalid %lu miss %lu\n"
                 "\twrite busy %lu cancel %lu invalid %lu "
                 "miss %lu done %lu skip %lu\n"
                 "\tbypassed %lu evicts %lu cancelled writeback %lu bio %lu",
                 nr_cmds, (1UL << sc->hash_bits),
                 nr_cte_full, nr_ctes,
                 (1UL << sc->hash_bits) * DEFAULT_ALIASING,
                 sc->read_clean, sc->read_busy,
                 sc->read_invalid, sc->read_miss,
                 sc->write_busy, sc->write_cancel,
                 sc->write_invalid, sc->write_miss,
                 sc->write_done, sc->write_skip,
                 sc->cache_bypassed, sc->cache_evictions,
                 sc->writeback_cancelled, sc->bio_cancelled);
        break;
    case STATUSTYPE_TABLE:
        /* strcat() needs a terminated string; modestr is on the stack */
        modestr[0] = '\0';
        if (sc->options.mode == CACHE_MODE_WRITEBACK)
            strcat(modestr, "writeback");
        else if (sc->options.mode == CACHE_MODE_READCACHE)
            strcat(modestr, "readcache");
        else
            strcat(modestr, "writethrough");
        ssdcache_format_options(sc, optstr);
        snprintf(result, maxlen, "%s %s blocksize %lu %s assoc %d%s",
                 sc->target_dev->name, sc->cache_dev->name,
                 sc->block_size, modestr, sc->options.assoc, optstr);
        break;
    }
    return 0;
}

static int ssdcache_iterate_devices(struct dm_target *ti,
                                    iterate_devices_callout_fn fn, void *data)
{
    struct ssdcache_ctx *sc = ti->private;

    if (!sc)
        return 0;

    return fn(ti, sc->target_dev, 0, ti->len, data);
}

static struct target_type ssdcache_target = {
    .name    = "ssdcache",
    .version = {1, 1, 0},
    .module  = THIS_MODULE,
    .ctr     = ssdcache_ctr,
    .dtr     = ssdcache_dtr,
    .map     = ssdcache_map,
    .end_io  = ssdcache_endio,
    .status  = ssdcache_status,
    .iterate_devices = ssdcache_iterate_devices,
};

int __init dm_ssdcache_init(void)
{
    int r;

    r = pool_init();
    if (r < 0) {
        DMERR("kmempool allocation failed: %d", r);
        return r;
    }

    _ssdcached_wq = create_singlethread_workqueue("kssdcached");
    if (!_ssdcached_wq) {
        DMERR("failed to start kssdcached");
        pool_exit();
        return -ENOMEM;
    }
    INIT_WORK(&_ssdcached_work, process_sio);

    r = dm_register_target(&ssdcache_target);
    if (r < 0) {
        DMERR("register failed %d", r);
        destroy_workqueue(_ssdcached_wq);
        pool_exit();
    }

    return r;
}

void dm_ssdcache_exit(void)
{
    /* Tear down in reverse order of initialization */
    dm_unregister_target(&ssdcache_target);
    destroy_workqueue(_ssdcached_wq);
    pool_exit();
}

module_init(dm_ssdcache_init);
module_exit(dm_ssdcache_exit);

MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Hannes Reinecke <hare@xxxxxxx>");
MODULE_LICENSE("GPL");
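PS: a quick sizing example, derived from ssdcache_ctr() above (the numbers
are illustrative): with the default blocksize of 256 sectors (128 KiB) and
the fixed 16-way aliasing, a 64 GiB cache device gives
num_cmd = 64 GiB / (128 KiB * 16) = 32768, hence hash_bits = 15, i.e.
32768 cmds of 16 ctes each, or 524288 cache blocks covering the full device.
Cache block N of cmd H then lives at sector
data_offset + (H * 16 + N) * blocksize on the cache device, cf.
to_cache_sector(). If the device size is not a power-of-two multiple of
blocksize * 16, the remainder is simply lost to the fls() rounding.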