Hi Alasdair
Here is the patch for buffered IO. The purpose of this subsystem is to
provide a simple buffer cache that can be used by other dm targets.
You can add it to your patchset, below the mm line (not yet to be sent to
Linus), so that Fujita and others can see it and use it.
If someone needs extensions to this layer (for example, the ability to
hold more buffers simultaneously, or support for buffers significantly
larger than page size), write to me; we can discuss it and I can make the
changes.
I'm also CCing Heinz; maybe he can find a use for this layer in the
replicator, maybe not. Have a look at it.
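
To illustrate the intended use, here is a minimal sketch of how a target
could drive the interface (it is not part of the patch; "bdev" and "r" are
assumed to be provided by the caller, and error handling is abbreviated):

	struct dm_bufio_client *c;
	struct dm_buffer *b;
	void *data;

	/* one client per underlying device, 4096-byte blocks */
	c = dm_bufio_client_create(bdev, 4096);
	if (IS_ERR(c))
		return PTR_ERR(c);

	/* read block 0, modify it and mark it for writeback */
	data = dm_bufio_read(c, 0, &b);
	if (IS_ERR(data)) {
		dm_bufio_client_destroy(c);
		return PTR_ERR(data);
	}
	memset(data, 0, 4096);
	dm_bufio_mark_buffer_dirty(b);
	dm_bufio_release(b);

	/* everything dirtied above is on disk when this returns */
	r = dm_bufio_write_dirty_buffers(c);

	dm_bufio_client_destroy(c);

Remember the warning in the header comment: a thread may hold at most one
buffer at a time.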
Mikulas
Buffered IO interface.
Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>
---
drivers/md/Makefile | 2
drivers/md/dm-bufio.c | 510 +++++++++++++++++++++++++++++++++++++++++++++++
include/linux/dm-bufio.h | 20 +
3 files changed, 531 insertions(+), 1 deletion(-)
Index: linux-2.6.28-rc2-devel/drivers/md/dm-bufio.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.28-rc2-devel/drivers/md/dm-bufio.c 2008-11-05 04:59:04.000000000 +0100
@@ -0,0 +1,510 @@
+/*
+ * Copyright (C) 2008 Red Hat Czech, s.r.o.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+#include <linux/slab.h>
+
+#include <linux/dm-bufio.h>
+
+/*
+ * dm_bufio_client_create --- create a buffered IO cache on a given device
+ * dm_bufio_client_destroy --- release a buffered IO cache
+ *
+ * dm_bufio_read --- read a given block from disk. Returns a pointer to the
+ *	data and, via *bp, a pointer to the dm_buffer that can be used to
+ *	release the buffer or to mark it dirty.
+ * dm_bufio_new --- like dm_bufio_read, but don't read anything from the disk.
+ * It is expected that the caller initializes the buffer and marks it
+ * dirty.
+ * dm_bufio_release --- release a reference obtained with dm_bufio_read or
+ *	dm_bufio_new. The data pointer and the dm_buffer pointer are no longer
+ *	valid after this call.
+ *
+ * WARNING: to avoid deadlocks, a thread may hold at most one buffer at a time.
+ *	Multiple threads may each hold one buffer simultaneously.
+ *
+ * dm_bufio_mark_buffer_dirty --- mark a buffer dirty. It should be called after
+ * the buffer is modified.
+ * dm_bufio_write_dirty_buffers --- write all dirty buffers. Guarantees that
+ *	all buffers dirtied prior to this call are on disk by the time the
+ *	call returns.
+ *
+ * Under memory pressure, a buffer may be written back after
+ * dm_bufio_mark_buffer_dirty but before dm_bufio_write_dirty_buffers is
+ * called. dm_bufio_write_dirty_buffers only guarantees that the buffer is
+ * on disk; the actual write may happen earlier.
+ *
+ * dm_bufio_release_move --- like dm_bufio_release, and also move the buffer to
+ * the new block. dm_bufio_write_dirty_buffers is needed to commit the new
+ * block.
+ * dm_bufio_drop_buffers --- clear all buffers.
+ */
+
+#define THRESHOLD_MEMORY (8 * 1048576)
+#define LIMIT_MEMORY (1 * 1048576)
+
+#define DM_BUFIO_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head) / 2)
+#define DM_BUFIO_HASH(block) ((block) & (DM_BUFIO_HASH_SIZE - 1))
+
+#define B_READING 0
+#define B_WRITING 1
+#define B_DIRTY 2
+
+struct dm_bufio_client {
+ struct list_head lru;
+ struct list_head dirty_lru;
+ struct mutex lock;
+ struct block_device *bdev;
+ unsigned block_size;
+ unsigned char block_to_sector_shift;
+
+ unsigned long n_buffers;
+ unsigned threshold_buffers;
+ unsigned limit_buffers;
+
+ struct dm_buffer *reserved_buffer;
+ struct hlist_head cache_hash[DM_BUFIO_HASH_SIZE];
+ wait_queue_head_t free_buffer_wait;
+
+ int async_write_error;
+};
+
+struct dm_buffer {
+ struct hlist_node hash_list;
+ struct list_head lru_list;
+ sector_t block;
+ void *data;
+ unsigned hold_count;
+ int read_error;
+ int write_error;
+ unsigned long state;
+ struct dm_bufio_client *c;
+ struct bio bio;
+ struct bio_vec bio_vec;
+};
+
+static void *dm_bufio_alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask)
+{
+ return kmalloc(c->block_size, gfp_mask);
+}
+
+static void dm_bufio_free_buffer_data(struct dm_bufio_client *c, void *data)
+{
+ kfree(data);
+}
+
+
+static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
+{
+ struct dm_buffer *b;
+ b = kmalloc(sizeof(struct dm_buffer), gfp_mask);
+ if (!b)
+ return NULL;
+ b->c = c;
+ b->data = dm_bufio_alloc_buffer_data(c, gfp_mask);
+ if (!b->data) {
+ kfree(b);
+ return NULL;
+ }
+ return b;
+}
+
+static void free_buffer(struct dm_buffer *b)
+{
+ dm_bufio_free_buffer_data(b->c, b->data);
+ kfree(b);
+}
+
+
+static void link_buffer(struct dm_buffer *b, sector_t block, int dirty)
+{
+ struct dm_bufio_client *c = b->c;
+ c->n_buffers++;
+ b->block = block;
+ list_add(&b->lru_list, dirty ? &c->dirty_lru : &c->lru);
+ hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
+}
+
+static void unlink_buffer(struct dm_buffer *b)
+{
+ BUG_ON(!b->c->n_buffers);
+ b->c->n_buffers--;
+ hlist_del(&b->hash_list);
+ list_del(&b->lru_list);
+}
+
+
+static int just_io_schedule(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
+static void write_dirty_buffer(struct dm_buffer *b);
+
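+/*
+ * Find a buffer that no one holds and that can be reclaimed: scan the clean
+ * LRU first, then the dirty LRU. If "wait" is set, wait for any read or
+ * write in flight on the buffer to finish; otherwise skip busy buffers.
+ * The returned buffer is unlinked from the hash and from the LRU lists.
+ */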
+static struct dm_buffer *get_unclaimed_buffer(struct dm_bufio_client *c, int wait)
+{
+ struct dm_buffer *b;
+ list_for_each_entry_reverse(b, &c->lru, lru_list) {
+ cond_resched();
+ BUG_ON(test_bit(B_WRITING, &b->state));
+ BUG_ON(test_bit(B_DIRTY, &b->state));
+ if (!b->hold_count) {
+ if (unlikely(test_bit(B_READING, &b->state))) {
+ if (!wait)
+ continue;
+ wait_on_bit(&b->state, B_READING, just_io_schedule, TASK_UNINTERRUPTIBLE);
+ }
+ unlink_buffer(b);
+ return b;
+ }
+ }
+ list_for_each_entry_reverse(b, &c->dirty_lru, lru_list) {
+ cond_resched();
+ BUG_ON(test_bit(B_READING, &b->state));
+ if (!b->hold_count) {
+ if (unlikely(test_bit(B_DIRTY, &b->state))) {
+ if (unlikely(test_bit(B_WRITING, &b->state)) && !wait)
+ write_dirty_buffer(b);
+ if (!wait)
+ continue;
+ }
+ if (unlikely(test_bit(B_WRITING, &b->state))) {
+ if (!wait)
+ continue;
+ wait_on_bit(&b->state, B_WRITING, just_io_schedule, TASK_UNINTERRUPTIBLE);
+ }
+ unlink_buffer(b);
+ return b;
+ }
+ }
+ return NULL;
+}
+
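+/*
+ * Allocate a buffer for a new block, always succeeding eventually: try a
+ * normal allocation first, then fall back to the preallocated reserved
+ * buffer, then to reclaiming an unheld cached buffer, and finally sleep on
+ * free_buffer_wait (dropping c->lock) until a buffer is released.
+ * Called with c->lock held.
+ */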
+static struct dm_buffer *alloc_buffer_wait(struct dm_bufio_client *c)
+{
+ struct dm_buffer *b;
+ DECLARE_WAITQUEUE(wait, current);
+
+retry:
+ b = alloc_buffer(c, GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN);
+ if (b)
+ return b;
+
+ if (c->reserved_buffer) {
+ b = c->reserved_buffer;
+ c->reserved_buffer = NULL;
+ return b;
+ }
+
+ b = get_unclaimed_buffer(c, 1);
+ if (b)
+ return b;
+
+ add_wait_queue(&c->free_buffer_wait, &wait);
+ set_task_state(current, TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&c->lock);
+ io_schedule();
+ set_task_state(current, TASK_RUNNING);
+ remove_wait_queue(&c->free_buffer_wait, &wait);
+ mutex_lock(&c->lock);
+ goto retry;
+}
+
+static void free_buffer_wake(struct dm_buffer *b)
+{
+ struct dm_bufio_client *c = b->c;
+	if (!c->reserved_buffer)
+		c->reserved_buffer = b;
+	else
+		free_buffer(b);
+ wake_up(&c->free_buffer_wait);
+ cond_resched();
+}
+
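+/*
+ * If the client caches more buffers than threshold_buffers, reclaim unheld
+ * buffers until the count drops back to the threshold. Buffers with I/O in
+ * flight are only waited for once the count exceeds limit_buffers.
+ */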
+static void check_watermark(struct dm_bufio_client *c)
+{
+ while (c->n_buffers > c->threshold_buffers) {
+ struct dm_buffer *b;
+		b = get_unclaimed_buffer(c, c->n_buffers > c->limit_buffers);
+		if (!b)
+			return;
+ free_buffer_wake(b);
+ }
+}
+
+static void read_endio(struct bio *bio, int error);
+
+static void dm_bufio_setup_bio(struct dm_buffer *b, sector_t block, bio_end_io_t *end_io)
+{
+ bio_init(&b->bio);
+ b->bio.bi_io_vec = &b->bio_vec;
+ b->bio.bi_max_vecs = 1;
+ b->bio.bi_sector = b->block << b->c->block_to_sector_shift;
+ b->bio.bi_bdev = b->c->bdev;
+ b->bio.bi_end_io = end_io;
+ if (!bio_add_page(&b->bio, virt_to_page(b->data), b->c->block_size, virt_to_phys(b->data) & (PAGE_SIZE - 1)))
+ BUG();
+}
+
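+/*
+ * Common code for dm_bufio_read and dm_bufio_new: look the block up in the
+ * hash table; if it is not cached, allocate a buffer (possibly sleeping),
+ * link it in and, if "read" is set, submit a read and wait for it to finish.
+ */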
+static void *dm_bufio_new_read(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp, int read)
+{
+ struct dm_buffer *b, *new_b = NULL;
+ struct hlist_node *hn;
+
+ cond_resched();
+ mutex_lock(&c->lock);
+retry_search:
+ hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)], hash_list) {
+ if (b->block == block) {
+ if (new_b)
+ free_buffer_wake(new_b);
+ b->hold_count++;
+ list_del(&b->lru_list);
+ list_add(&b->lru_list, test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state) ? &c->dirty_lru : &c->lru);
+unlock_wait_ret:
+ mutex_unlock(&c->lock);
+wait_ret:
+ wait_on_bit(&b->state, B_READING, just_io_schedule, TASK_UNINTERRUPTIBLE);
+ if (b->read_error) {
+ int error = b->read_error;
+ dm_bufio_release(b);
+ return ERR_PTR(error);
+ }
+ *bp = b;
+ return b->data;
+ }
+ cond_resched();
+ }
+ if (!new_b) {
+ new_b = alloc_buffer_wait(c);
+ goto retry_search;
+ }
+ b = new_b;
+ b->hold_count = 1;
+ b->read_error = 0;
+ b->write_error = 0;
+ link_buffer(b, block, 0);
+
+ check_watermark(c);
+
+ if (!read) {
+ b->state = 0;
+ goto unlock_wait_ret;
+ }
+
+ b->state = 1 << B_READING;
+
+ mutex_unlock(&c->lock);
+
+ dm_bufio_setup_bio(b, b->block, read_endio);
+ submit_bio(READ, &b->bio);
+
+ goto wait_ret;
+}
+
+void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp)
+{
+ return dm_bufio_new_read(c, block, bp, 1);
+}
+EXPORT_SYMBOL(dm_bufio_read);
+
+void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp)
+{
+ return dm_bufio_new_read(c, block, bp, 0);
+}
+EXPORT_SYMBOL(dm_bufio_new);
+
+static void read_endio(struct bio *bio, int error)
+{
+ struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
+ b->read_error = error;
+ BUG_ON(!test_bit(B_READING, &b->state));
+ smp_mb__before_clear_bit();
+ clear_bit(B_READING, &b->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&b->state, B_READING);
+}
+
+void dm_bufio_release(struct dm_buffer *b)
+{
+ struct dm_bufio_client *c = b->c;
+ mutex_lock(&c->lock);
+ BUG_ON(!b->hold_count);
+ BUG_ON(test_bit(B_READING, &b->state));
+ b->hold_count--;
+ if (!b->hold_count) {
+ wake_up(&c->free_buffer_wait);
+ if ((b->read_error || b->write_error) && !test_bit(B_WRITING, &b->state) && !test_bit(B_DIRTY, &b->state)) {
+ unlink_buffer(b);
+ free_buffer_wake(b);
+ }
+ }
+ mutex_unlock(&c->lock);
+}
+EXPORT_SYMBOL(dm_bufio_release);
+
+void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
+{
+ struct dm_bufio_client *c = b->c;
+ mutex_lock(&c->lock);
+ if (!test_and_set_bit(B_DIRTY, &b->state)) {
+ list_del(&b->lru_list);
+ list_add(&b->lru_list, &c->dirty_lru);
+ }
+ mutex_unlock(&c->lock);
+}
+EXPORT_SYMBOL(dm_bufio_mark_buffer_dirty);
+
+static void write_endio(struct bio *bio, int error);
+
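+/*
+ * Start writing a dirty buffer asynchronously: clear B_DIRTY, acquire the
+ * B_WRITING bit (waiting for any previous write to finish) and submit the
+ * bio. The caller must hold c->lock.
+ */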
+static void write_dirty_buffer(struct dm_buffer *b)
+{
+ if (!test_bit(B_DIRTY, &b->state))
+ return;
+ clear_bit(B_DIRTY, &b->state);
+ wait_on_bit_lock(&b->state, B_WRITING, just_io_schedule, TASK_UNINTERRUPTIBLE);
+ dm_bufio_setup_bio(b, b->block, write_endio);
+ submit_bio(WRITE, &b->bio);
+}
+
+static void write_endio(struct bio *bio, int error)
+{
+ struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
+ b->write_error = error;
+ if (unlikely(error)) {
+ struct dm_bufio_client *c = b->c;
+ cmpxchg(&c->async_write_error, 0, error);
+ }
+ BUG_ON(!test_bit(B_WRITING, &b->state));
+ smp_mb__before_clear_bit();
+ clear_bit(B_WRITING, &b->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&b->state, B_WRITING);
+}
+
+static void write_dirty_buffers_async(struct dm_bufio_client *c)
+{
+ struct dm_buffer *b;
+ list_for_each_entry_reverse(b, &c->dirty_lru, lru_list) {
+ cond_resched();
+ BUG_ON(test_bit(B_READING, &b->state));
+ write_dirty_buffer(b);
+ }
+}
+
+int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
+{
+ struct dm_buffer *b;
+ mutex_lock(&c->lock);
+ write_dirty_buffers_async(c);
+ mutex_unlock(&c->lock);
+ mutex_lock(&c->lock);
+ list_for_each_entry_reverse(b, &c->dirty_lru, lru_list) {
+ cond_resched();
+ BUG_ON(test_bit(B_READING, &b->state));
+ if (test_bit(B_WRITING, &b->state)) {
+ b->hold_count++;
+ mutex_unlock(&c->lock);
+ wait_on_bit(&b->state, B_WRITING, just_io_schedule, TASK_UNINTERRUPTIBLE);
+ mutex_lock(&c->lock);
+ b->hold_count--;
+ }
+ }
+ wake_up(&c->free_buffer_wait);
+ mutex_unlock(&c->lock);
+ return xchg(&c->async_write_error, 0);
+}
+EXPORT_SYMBOL(dm_bufio_write_dirty_buffers);
+
+void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
+{
+ struct dm_bufio_client *c = b->c;
+ mutex_lock(&c->lock);
+ BUG_ON(!b->hold_count);
+ BUG_ON(test_bit(B_READING, &b->state));
+ write_dirty_buffer(b);
+ if (b->hold_count == 1) {
+ wait_on_bit(&b->state, B_WRITING, just_io_schedule, TASK_UNINTERRUPTIBLE);
+ set_bit(B_DIRTY, &b->state);
+ unlink_buffer(b);
+ link_buffer(b, new_block, 1);
+ } else {
+ wait_on_bit_lock(&b->state, B_WRITING, just_io_schedule, TASK_UNINTERRUPTIBLE);
+ dm_bufio_setup_bio(b, new_block, write_endio);
+ submit_bio(WRITE, &b->bio);
+ wait_on_bit(&b->state, B_WRITING, just_io_schedule, TASK_UNINTERRUPTIBLE);
+ }
+ mutex_unlock(&c->lock);
+ dm_bufio_release(b);
+}
+EXPORT_SYMBOL(dm_bufio_release_move);
+
+void dm_bufio_drop_buffers(struct dm_bufio_client *c)
+{
+ struct dm_buffer *b;
+ mutex_lock(&c->lock);
+ write_dirty_buffers_async(c);
+ while ((b = get_unclaimed_buffer(c, 1)))
+ free_buffer_wake(b);
+ BUG_ON(!list_empty(&c->lru));
+ BUG_ON(!list_empty(&c->dirty_lru));
+ mutex_unlock(&c->lock);
+}
+EXPORT_SYMBOL(dm_bufio_drop_buffers);
+
+struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size)
+{
+ int r;
+ struct dm_bufio_client *c;
+ unsigned i;
+
+ BUG_ON(!block_size || (block_size & (block_size - 1)));
+
+ c = kmalloc(sizeof(*c), GFP_KERNEL);
+ if (!c) {
+ r = -ENOMEM;
+ goto bad_client;
+ }
+
+ c->bdev = bdev;
+ c->block_size = block_size;
+ c->block_to_sector_shift = ffs(block_size) - 1 - SECTOR_SHIFT;
+ INIT_LIST_HEAD(&c->lru);
+ INIT_LIST_HEAD(&c->dirty_lru);
+ for (i = 0; i < DM_BUFIO_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&c->cache_hash[i]);
+ mutex_init(&c->lock);
+ c->n_buffers = 0;
+ c->threshold_buffers = THRESHOLD_MEMORY / c->block_size + 1;
+ c->limit_buffers = (THRESHOLD_MEMORY + LIMIT_MEMORY) / c->block_size + 1;
+ init_waitqueue_head(&c->free_buffer_wait);
+ c->async_write_error = 0;
+
+ c->reserved_buffer = alloc_buffer(c, GFP_KERNEL);
+ if (!c->reserved_buffer) {
+ r = -ENOMEM;
+ goto bad_buffer;
+ }
+
+ return c;
+
+bad_buffer:
+ kfree(c);
+bad_client:
+ return ERR_PTR(r);
+}
+EXPORT_SYMBOL(dm_bufio_client_create);
+
+void dm_bufio_client_destroy(struct dm_bufio_client *c)
+{
+ unsigned i;
+ dm_bufio_drop_buffers(c);
+ for (i = 0; i < DM_BUFIO_HASH_SIZE; i++)
+ BUG_ON(!hlist_empty(&c->cache_hash[i]));
+ BUG_ON(!c->reserved_buffer);
+ free_buffer(c->reserved_buffer);
+ BUG_ON(c->n_buffers != 0);
+ kfree(c);
+}
+EXPORT_SYMBOL(dm_bufio_client_destroy);
Index: linux-2.6.28-rc2-devel/drivers/md/Makefile
===================================================================
--- linux-2.6.28-rc2-devel.orig/drivers/md/Makefile 2008-11-04 05:41:38.000000000 +0100
+++ linux-2.6.28-rc2-devel/drivers/md/Makefile 2008-11-05 04:59:10.000000000 +0100
@@ -3,7 +3,7 @@
#
dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
- dm-ioctl.o dm-io.o dm-kcopyd.o
+ dm-ioctl.o dm-io.o dm-kcopyd.o dm-bufio.o
dm-multipath-objs := dm-path-selector.o dm-mpath.o
dm-snapshot-objs := dm-snap.o dm-exception-store.o
dm-mirror-objs := dm-raid1.o
Index: linux-2.6.28-rc2-devel/include/linux/dm-bufio.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.28-rc2-devel/include/linux/dm-bufio.h 2008-11-04 05:43:35.000000000 +0100
@@ -0,0 +1,20 @@
+#ifndef _LINUX_DM_BUFIO_H
+#define _LINUX_DM_BUFIO_H
+
+struct dm_bufio_client;
+struct dm_buffer;
+
+void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp);
+void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp);
+void dm_bufio_release(struct dm_buffer *b);
+
+void dm_bufio_mark_buffer_dirty(struct dm_buffer *b);
+int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c);
+
+void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
+
+struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size);
+void dm_bufio_client_destroy(struct dm_bufio_client *c);
+void dm_bufio_drop_buffers(struct dm_bufio_client *c);
+
+#endif