Buffered IO interface.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/Makefile      |    2 
 drivers/md/dm-bufio.c    |  516 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dm-bufio.h |   20 +
 3 files changed, 537 insertions(+), 1 deletion(-)

Index: linux-2.6.29-rc1-devel/drivers/md/dm-bufio.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.29-rc1-devel/drivers/md/dm-bufio.c	2009-01-15 18:26:49.000000000 +0100
@@ -0,0 +1,516 @@
+/*
+ * Copyright (C) 2008 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka
+ *
+ * This file is released under the GPL.
+ */
+
+#include
+#include
+
+#include
+
+/*
+ * dm_bufio_client_create --- create a buffered IO cache on a given device
+ * dm_bufio_client_destroy --- release a buffered IO cache
+ *
+ * dm_bufio_read --- read a given block from disk. Returns a pointer to the
+ *	data and a pointer to the dm_buffer that can be used to release the
+ *	buffer or to mark it dirty.
+ * dm_bufio_new --- like dm_bufio_read, but don't read anything from the disk.
+ *	The caller is expected to initialize the buffer and mark it dirty.
+ * dm_bufio_release --- release a reference obtained with dm_bufio_read or
+ *	dm_bufio_new. The data pointer and the dm_buffer pointer are no longer
+ *	valid after this call.
+ *
+ * WARNING: to avoid deadlocks, each thread can hold at most one buffer at a
+ * time. Multiple threads can each hold one buffer simultaneously.
+ *
+ * dm_bufio_mark_buffer_dirty --- mark a buffer dirty. It should be called
+ *	after the buffer is modified.
+ * dm_bufio_write_dirty_buffers --- write all dirty buffers. Guarantees that
+ *	all dirty buffers created prior to this call are on disk when this
+ *	call exits.
+ *
+ *	Under memory pressure, a buffer may be written out after
+ *	dm_bufio_mark_buffer_dirty but before dm_bufio_write_dirty_buffers.
+ *	dm_bufio_write_dirty_buffers only guarantees that the buffer is on
+ *	disk when it returns; the actual write may happen earlier.
+ *
+ * dm_bufio_release_move --- like dm_bufio_release, but also move the buffer
+ *	to a new block. dm_bufio_write_dirty_buffers is needed to commit the
+ *	new block.
+ * dm_bufio_drop_buffers --- clear all buffers.
+ */
+
+#define THRESHOLD_MEMORY	(8 * 1048576)
+#define LIMIT_MEMORY		(1 * 1048576)
+
+#define DM_BUFIO_HASH_SIZE	(PAGE_SIZE / sizeof(struct hlist_head) / 2)
+#define DM_BUFIO_HASH(block)	((block) & (DM_BUFIO_HASH_SIZE - 1))
+
+#define B_READING	0
+#define B_WRITING	1
+#define B_DIRTY		2
+
+struct dm_bufio_client {
+	struct list_head lru;
+	struct list_head dirty_lru;
+	struct mutex lock;
+	struct block_device *bdev;
+	unsigned block_size;
+	unsigned char block_to_sector_shift;
+
+	unsigned long n_buffers;
+	unsigned threshold_buffers;
+	unsigned limit_buffers;
+
+	struct dm_buffer *reserved_buffer;
+	struct hlist_head cache_hash[DM_BUFIO_HASH_SIZE];
+	wait_queue_head_t free_buffer_wait;
+
+	int async_write_error;
+};
+
+struct dm_buffer {
+	struct hlist_node hash_list;
+	struct list_head lru_list;
+	sector_t block;
+	void *data;
+	unsigned hold_count;
+	int read_error;
+	int write_error;
+	unsigned long state;
+	struct dm_bufio_client *c;
+	struct bio bio;
+	struct bio_vec bio_vec;
+};
+
+static void *dm_bufio_alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask)
+{
+	return kmalloc(c->block_size, gfp_mask);
+}
+
+static void dm_bufio_free_buffer_data(struct dm_bufio_client *c, void *data)
+{
+	kfree(data);
+}
+
+
+static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
+{
+	struct dm_buffer *b;
+	b = kmalloc(sizeof(struct dm_buffer), gfp_mask);
+	if (!b)
+		return NULL;
+	b->c = c;
+	b->data = dm_bufio_alloc_buffer_data(c, gfp_mask);
+	if (!b->data) {
+		kfree(b);
+		return NULL;
+	}
+	return b;
+}
+
+static void free_buffer(struct dm_buffer *b)
+{
+	dm_bufio_free_buffer_data(b->c, b->data);
+	kfree(b);
+}
+
+
+static void link_buffer(struct dm_buffer *b, sector_t block, int dirty)
+{
+	struct dm_bufio_client *c = b->c;
+	c->n_buffers++;
+	b->block = block;
+	list_add(&b->lru_list, dirty ? &c->dirty_lru : &c->lru);
+	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
+}
+
+static void unlink_buffer(struct dm_buffer *b)
+{
+	BUG_ON(!b->c->n_buffers);
+	b->c->n_buffers--;
+	hlist_del(&b->hash_list);
+	list_del(&b->lru_list);
+}
+
+
+static int just_io_schedule(void *word)
+{
+	io_schedule();
+	return 0;
+}
+
+static void write_dirty_buffer(struct dm_buffer *b);
+
+static struct dm_buffer *get_unclaimed_buffer(struct dm_bufio_client *c, int wait)
+{
+	struct dm_buffer *b;
+	list_for_each_entry_reverse(b, &c->lru, lru_list) {
+		cond_resched();
+		if (test_bit(B_WRITING, &b->state) || test_bit(B_DIRTY, &b->state)) {
+			printk(KERN_DEBUG "%p: state %lx, hold %u, block %llx\n",
+			       b, b->state, b->hold_count,
+			       (unsigned long long)b->block);
+		}
+		BUG_ON(test_bit(B_WRITING, &b->state));
+		BUG_ON(test_bit(B_DIRTY, &b->state));
+		if (!b->hold_count) {
+			if (unlikely(test_bit(B_READING, &b->state))) {
+				if (!wait)
+					continue;
+				wait_on_bit(&b->state, B_READING, just_io_schedule,
+					    TASK_UNINTERRUPTIBLE);
+			}
+			unlink_buffer(b);
+			return b;
+		}
+	}
+	list_for_each_entry_reverse(b, &c->dirty_lru, lru_list) {
+		cond_resched();
+		BUG_ON(test_bit(B_READING, &b->state));
+		if (!b->hold_count) {
+			if (unlikely(test_bit(B_DIRTY, &b->state))) {
+				if (unlikely(test_bit(B_WRITING, &b->state)) && !wait)
+					write_dirty_buffer(b);
+				if (!wait)
+					continue;
+			}
+			if (unlikely(test_bit(B_WRITING, &b->state))) {
+				if (!wait)
+					continue;
+				wait_on_bit(&b->state, B_WRITING, just_io_schedule,
+					    TASK_UNINTERRUPTIBLE);
+			}
+			unlink_buffer(b);
+			return b;
+		}
+	}
+	return NULL;
+}
+
+static struct dm_buffer *alloc_buffer_wait(struct dm_bufio_client *c)
+{
+	struct dm_buffer *b;
+	DECLARE_WAITQUEUE(wait, current);
+
+retry:
+	b = alloc_buffer(c, GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN);
+	if (b)
+		return b;
+
+	if (c->reserved_buffer) {
+		b = c->reserved_buffer;
+		c->reserved_buffer = NULL;
+		return b;
+	}
+
+	b = get_unclaimed_buffer(c, 1);
+	if (b)
+		return b;
+
+	add_wait_queue(&c->free_buffer_wait, &wait);
+	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	mutex_unlock(&c->lock);
+	io_schedule();
+	set_task_state(current, TASK_RUNNING);
+	remove_wait_queue(&c->free_buffer_wait, &wait);
+	mutex_lock(&c->lock);
+	goto retry;
+}
+
+static void free_buffer_wake(struct dm_buffer *b)
+{
+	struct dm_bufio_client *c = b->c;
+	if (!c->reserved_buffer)
+		c->reserved_buffer = b;
+	else
+		free_buffer(b);
+	wake_up(&c->free_buffer_wait);
+	cond_resched();
+}
+
+static void check_watermark(struct dm_bufio_client *c)
+{
+	while (c->n_buffers > c->threshold_buffers) {
+		struct dm_buffer *b;
+		if (!(b = get_unclaimed_buffer(c, c->n_buffers > c->limit_buffers)))
+			return;
+		free_buffer_wake(b);
+	}
+}
+
+static void read_endio(struct bio *bio, int error);
+
+static void dm_bufio_setup_bio(struct dm_buffer *b, sector_t block, bio_end_io_t *end_io)
+{
+	bio_init(&b->bio);
+	b->bio.bi_io_vec = &b->bio_vec;
+	b->bio.bi_max_vecs = 1;
+	b->bio.bi_sector = block << b->c->block_to_sector_shift;
+	b->bio.bi_bdev = b->c->bdev;
+	b->bio.bi_end_io = end_io;
+	if (!bio_add_page(&b->bio, virt_to_page(b->data), b->c->block_size,
+			  virt_to_phys(b->data) & (PAGE_SIZE - 1)))
+		BUG();
+}
+
+static void *dm_bufio_new_read(struct dm_bufio_client *c, sector_t block,
+			       struct dm_buffer **bp, int read)
+{
+	struct dm_buffer *b, *new_b = NULL;
+	struct hlist_node *hn;
+
+	cond_resched();
+	mutex_lock(&c->lock);
+retry_search:
+	hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)], hash_list) {
+		if (b->block == block) {
+			if (new_b)
+				free_buffer_wake(new_b);
+			b->hold_count++;
+			list_del(&b->lru_list);
+			list_add(&b->lru_list,
+				 test_bit(B_DIRTY, &b->state) ||
+				 test_bit(B_WRITING, &b->state) ?
+				 &c->dirty_lru : &c->lru);
+unlock_wait_ret:
+			mutex_unlock(&c->lock);
+wait_ret:
+			wait_on_bit(&b->state, B_READING, just_io_schedule,
+				    TASK_UNINTERRUPTIBLE);
+			if (b->read_error) {
+				int error = b->read_error;
+				dm_bufio_release(b);
+				return ERR_PTR(error);
+			}
+			*bp = b;
+			return b->data;
+		}
+		cond_resched();
+	}
+	if (!new_b) {
+		new_b = alloc_buffer_wait(c);
+		goto retry_search;
+	}
+
+	check_watermark(c);
+
+	b = new_b;
+	b->hold_count = 1;
+	b->read_error = 0;
+	b->write_error = 0;
+	link_buffer(b, block, 0);
+
+	if (!read) {
+		b->state = 0;
+		goto unlock_wait_ret;
+	}
+
+	b->state = 1 << B_READING;
+
+	mutex_unlock(&c->lock);
+
+	dm_bufio_setup_bio(b, b->block, read_endio);
+	submit_bio(READ, &b->bio);
+
+	goto wait_ret;
+}
+
+void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp)
+{
+	return dm_bufio_new_read(c, block, bp, 1);
+}
+EXPORT_SYMBOL(dm_bufio_read);
+
+void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp)
+{
+	return dm_bufio_new_read(c, block, bp, 0);
+}
+EXPORT_SYMBOL(dm_bufio_new);
+
+static void read_endio(struct bio *bio, int error)
+{
+	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
+	b->read_error = error;
+	BUG_ON(!test_bit(B_READING, &b->state));
+	smp_mb__before_clear_bit();
+	clear_bit(B_READING, &b->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&b->state, B_READING);
+}
+
+void dm_bufio_release(struct dm_buffer *b)
+{
+	struct dm_bufio_client *c = b->c;
+	mutex_lock(&c->lock);
+	BUG_ON(!b->hold_count);
+	BUG_ON(test_bit(B_READING, &b->state));
+	b->hold_count--;
+	if (!b->hold_count) {
+		wake_up(&c->free_buffer_wait);
+		if ((b->read_error || b->write_error) &&
+		    !test_bit(B_WRITING, &b->state) &&
+		    !test_bit(B_DIRTY, &b->state)) {
+			unlink_buffer(b);
+			free_buffer_wake(b);
+		}
+	}
+	mutex_unlock(&c->lock);
+}
+EXPORT_SYMBOL(dm_bufio_release);
+
+void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
+{
+	struct dm_bufio_client *c = b->c;
+	mutex_lock(&c->lock);
+	if (!test_and_set_bit(B_DIRTY, &b->state)) {
+		list_del(&b->lru_list);
+		list_add(&b->lru_list, &c->dirty_lru);
+	}
+	mutex_unlock(&c->lock);
+}
+EXPORT_SYMBOL(dm_bufio_mark_buffer_dirty);
+
+static void write_endio(struct bio *bio, int error);
+
+static void write_dirty_buffer(struct dm_buffer *b)
+{
+	if (!test_bit(B_DIRTY, &b->state))
+		return;
+	clear_bit(B_DIRTY, &b->state);
+	wait_on_bit_lock(&b->state, B_WRITING, just_io_schedule,
+			 TASK_UNINTERRUPTIBLE);
+	dm_bufio_setup_bio(b, b->block, write_endio);
+	submit_bio(WRITE, &b->bio);
+}
+
+static void write_endio(struct bio *bio, int error)
+{
+	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
+	b->write_error = error;
+	if (unlikely(error)) {
+		struct dm_bufio_client *c = b->c;
+		cmpxchg(&c->async_write_error, 0, error);
+	}
+	BUG_ON(!test_bit(B_WRITING, &b->state));
+	smp_mb__before_clear_bit();
+	clear_bit(B_WRITING, &b->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&b->state, B_WRITING);
+}
+
+static void write_dirty_buffers_async(struct dm_bufio_client *c)
+{
+	struct dm_buffer *b;
+	list_for_each_entry_reverse(b, &c->dirty_lru, lru_list) {
+		cond_resched();
+		BUG_ON(test_bit(B_READING, &b->state));
+		write_dirty_buffer(b);
+	}
+}
+
+int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
+{
+	struct dm_buffer *b;
+	mutex_lock(&c->lock);
+	write_dirty_buffers_async(c);
+	mutex_unlock(&c->lock);
+	mutex_lock(&c->lock);
+	list_for_each_entry_reverse(b, &c->dirty_lru, lru_list) {
+		cond_resched();
+		BUG_ON(test_bit(B_READING, &b->state));
+		if (test_bit(B_WRITING, &b->state)) {
+			b->hold_count++;
+			mutex_unlock(&c->lock);
+			wait_on_bit(&b->state, B_WRITING, just_io_schedule,
+				    TASK_UNINTERRUPTIBLE);
+			mutex_lock(&c->lock);
+			b->hold_count--;
+		}
+	}
+	wake_up(&c->free_buffer_wait);
+	mutex_unlock(&c->lock);
+	return xchg(&c->async_write_error, 0);
+}
+EXPORT_SYMBOL(dm_bufio_write_dirty_buffers);
+
+void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
+{
+	struct dm_bufio_client *c = b->c;
+	mutex_lock(&c->lock);
+	BUG_ON(!b->hold_count);
+	BUG_ON(test_bit(B_READING, &b->state));
+	write_dirty_buffer(b);
+	if (b->hold_count == 1) {
+		wait_on_bit(&b->state, B_WRITING, just_io_schedule,
+			    TASK_UNINTERRUPTIBLE);
+		set_bit(B_DIRTY, &b->state);
+		unlink_buffer(b);
+		link_buffer(b, new_block, 1);
+	} else {
+		wait_on_bit_lock(&b->state, B_WRITING, just_io_schedule,
+				 TASK_UNINTERRUPTIBLE);
+		dm_bufio_setup_bio(b, new_block, write_endio);
+		submit_bio(WRITE, &b->bio);
+		wait_on_bit(&b->state, B_WRITING, just_io_schedule,
+			    TASK_UNINTERRUPTIBLE);
+	}
+	mutex_unlock(&c->lock);
+	dm_bufio_release(b);
+}
+EXPORT_SYMBOL(dm_bufio_release_move);
+
+void dm_bufio_drop_buffers(struct dm_bufio_client *c)
+{
+	struct dm_buffer *b;
+	mutex_lock(&c->lock);
+	write_dirty_buffers_async(c);
+	while ((b = get_unclaimed_buffer(c, 1)))
+		free_buffer_wake(b);
+	BUG_ON(!list_empty(&c->lru));
+	BUG_ON(!list_empty(&c->dirty_lru));
+	mutex_unlock(&c->lock);
+}
+EXPORT_SYMBOL(dm_bufio_drop_buffers);
+
+struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size)
+{
+	int r;
+	struct dm_bufio_client *c;
+	unsigned i;
+
+	BUG_ON(!block_size || (block_size & (block_size - 1)));
+
+	c = kmalloc(sizeof(*c), GFP_KERNEL);
+	if (!c) {
+		r = -ENOMEM;
+		goto bad_client;
+	}
+
+	c->bdev = bdev;
+	c->block_size = block_size;
+	c->block_to_sector_shift = ffs(block_size) - 1 - SECTOR_SHIFT;
+	INIT_LIST_HEAD(&c->lru);
+	INIT_LIST_HEAD(&c->dirty_lru);
+	for (i = 0; i < DM_BUFIO_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&c->cache_hash[i]);
+	mutex_init(&c->lock);
+	c->n_buffers = 0;
+	c->threshold_buffers = THRESHOLD_MEMORY / c->block_size + 1;
+	c->limit_buffers = (THRESHOLD_MEMORY + LIMIT_MEMORY) / c->block_size + 1;
+	init_waitqueue_head(&c->free_buffer_wait);
+	c->async_write_error = 0;
+
+	c->reserved_buffer = alloc_buffer(c, GFP_KERNEL);
+	if (!c->reserved_buffer) {
+		r = -ENOMEM;
+		goto bad_buffer;
+	}
+
+	return c;
+
+bad_buffer:
+	kfree(c);
+bad_client:
+	return ERR_PTR(r);
+}
+EXPORT_SYMBOL(dm_bufio_client_create);
+
+void dm_bufio_client_destroy(struct dm_bufio_client *c)
+{
+	unsigned i;
+	dm_bufio_drop_buffers(c);
+	for (i = 0; i < DM_BUFIO_HASH_SIZE; i++)
+		BUG_ON(!hlist_empty(&c->cache_hash[i]));
+	BUG_ON(!c->reserved_buffer);
+	free_buffer(c->reserved_buffer);
+	BUG_ON(c->n_buffers != 0);
+	kfree(c);
+}
+EXPORT_SYMBOL(dm_bufio_client_destroy);
Index: linux-2.6.29-rc1-devel/drivers/md/Makefile
===================================================================
--- linux-2.6.29-rc1-devel.orig/drivers/md/Makefile	2009-01-14 02:39:52.000000000 +0100
+++ linux-2.6.29-rc1-devel/drivers/md/Makefile	2009-01-15 18:26:49.000000000 +0100
@@ -3,7 +3,7 @@
 #
 
 dm-mod-objs	:= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
+		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-bufio.o dm-sysfs.o
 dm-multipath-objs := dm-path-selector.o dm-mpath.o
 dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
Index: linux-2.6.29-rc1-devel/include/linux/dm-bufio.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.29-rc1-devel/include/linux/dm-bufio.h	2009-01-15 18:26:49.000000000 +0100
@@ -0,0 +1,20 @@
+#ifndef _LINUX_DM_BUFIO_H
+#define _LINUX_DM_BUFIO_H
+
+struct dm_bufio_client;
+struct dm_buffer;
+
+void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp);
+void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp);
+void dm_bufio_release(struct dm_buffer *b);
+
+void dm_bufio_mark_buffer_dirty(struct dm_buffer *b);
+int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c);
+
+void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
+
+struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size);
+void dm_bufio_client_destroy(struct dm_bufio_client *c);
+void dm_bufio_drop_buffers(struct dm_bufio_client *c);
+
+#endif
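
For readers new to the interface, here is a minimal usage sketch (not part of
the patch). It assumes an already-open struct block_device; the 4096-byte
block size, block number 0 and the function name are made up for illustration,
and error handling is reduced to the essentials:

#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/string.h>
#include <linux/dm-bufio.h>

/* Hypothetical example: zero the first block of "bdev" through the cache. */
static int example_zero_first_block(struct block_device *bdev)
{
	struct dm_bufio_client *c;
	struct dm_buffer *bp;
	void *data;
	int r;

	/* one client per device; the block size must be a power of two */
	c = dm_bufio_client_create(bdev, 4096);
	if (IS_ERR(c))
		return PTR_ERR(c);

	/* read the block, modify it and mark it dirty */
	data = dm_bufio_read(c, 0, &bp);
	if (IS_ERR(data)) {
		r = PTR_ERR(data);
		goto out;
	}
	memset(data, 0, 4096);
	dm_bufio_mark_buffer_dirty(bp);
	dm_bufio_release(bp);	/* at most one buffer held per thread */

	/* all buffers dirtied before this call are on disk when it returns */
	r = dm_bufio_write_dirty_buffers(c);
out:
	dm_bufio_client_destroy(c);
	return r;
}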
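
And a similarly hypothetical sketch of the relocation path described in the
header comment (dm_bufio_release_move, then dm_bufio_write_dirty_buffers to
commit the new location); the client and block numbers are placeholders
supplied by the caller:

#include <linux/types.h>
#include <linux/err.h>
#include <linux/dm-bufio.h>

/* Hypothetical example: move the contents of old_block to new_block. */
static int example_relocate_block(struct dm_bufio_client *c,
				  sector_t old_block, sector_t new_block)
{
	struct dm_buffer *bp;
	void *data;

	/* bring the old block into the cache */
	data = dm_bufio_read(c, old_block, &bp);
	if (IS_ERR(data))
		return PTR_ERR(data);

	/* drop the reference and retarget the buffer at new_block */
	dm_bufio_release_move(bp, new_block);

	/* commit the data at its new location */
	return dm_bufio_write_dirty_buffers(c);
}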