dm optimization: make io_lock percpu

This patch optimizes io_lock. io_lock is a read-write semaphore: it is
taken for read on the common I/O path and for write on the uncommon
suspend path.

Each thread taking or releasing io_lock for read performs an atomic
operation on the memory area where the lock is placed. On SMP machines
these atomic operations can cause cache line bouncing between
processors.

To avoid this cache line bouncing, io_lock is changed to an array of
read-write semaphores, one semaphore per CPU. A read lock is taken only
on the semaphore belonging to the current CPU; a write lock is taken on
all the semaphores. This maintains rw-semaphore semantics but
eliminates cache line bouncing when there are many concurrent readers.
Write-lock performance is degraded (we have to lock as many locks as
there are processors in the machine), but that doesn't matter because
the write lock is only taken on suspend or resume.

Note that the patch defines the functions that manage the percpu lock
with a macro, declare_percpu_rw_lock. The reason for using a macro is
that the next patch introduces a similar definition for a per-cpu rw
spinlock; the macro allows the same code to be shared between percpu
rw-semaphores and percpu rw-spinlocks.

Signed-off-by: Mikulas Patocka

---
 drivers/md/dm.c |  148 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 135 insertions(+), 13 deletions(-)

Index: linux-3.3-fast/drivers/md/dm.c
===================================================================
--- linux-3.3-fast.orig/drivers/md/dm.c	2012-04-21 03:19:50.000000000 +0200
+++ linux-3.3-fast/drivers/md/dm.c	2012-04-21 03:21:56.000000000 +0200
@@ -34,6 +34,120 @@ DEFINE_RATELIMIT_STATE(dm_ratelimit_stat
 EXPORT_SYMBOL(dm_ratelimit_state);
 #endif
 
+#ifndef CONFIG_SMP
+
+#define declare_percpu_rw_lock(name, struc, ini, dn_rd, up_rd, dn_wr, up_wr)\
+									\
+struct percpu_##name {							\
+	struc sem;							\
+};									\
+									\
+static void free_percpu_##name(struct percpu_##name *sem)		\
+{									\
+}									\
+									\
+static int init_percpu_##name(struct percpu_##name *sem)		\
+{									\
+	ini(&sem->sem);							\
+	return 0;							\
+}									\
+									\
+static int down_read_percpu_##name(struct percpu_##name *sem)		\
+{									\
+	dn_rd(&sem->sem);						\
+	return 0;							\
+}									\
+									\
+static void up_read_percpu_##name(struct percpu_##name *sem, int lock_cpu)\
+{									\
+	up_rd(&sem->sem);						\
+}									\
+									\
+static void down_write_percpu_##name(struct percpu_##name *sem)	\
+{									\
+	dn_wr(&sem->sem, 0);						\
+}									\
+									\
+static void up_write_percpu_##name(struct percpu_##name *sem)		\
+{									\
+	up_wr(&sem->sem);						\
+}									\
+
+#else
+
+#define declare_percpu_rw_lock(name, struc, ini, dn_rd, up_rd, dn_wr, up_wr)\
+									\
+struct percpu_##name {							\
+	struc **sem;							\
+};									\
+									\
+static void free_percpu_##name(struct percpu_##name *sem)		\
+{									\
+	int i;								\
+	for (i = 0; i < nr_cpu_ids; i++)				\
+		if (sem->sem[i])					\
+			kfree(sem->sem[i]);				\
+	kfree(sem->sem);						\
+}									\
+									\
+static int init_percpu_##name(struct percpu_##name *sem)		\
+{									\
+	size_t size = max(sizeof(struc), (size_t)L1_CACHE_BYTES);	\
+	int i;								\
+	sem->sem = kzalloc(nr_cpu_ids * sizeof(struc *), GFP_KERNEL);	\
+	if (!sem->sem)							\
+		return -ENOMEM;						\
+	for (i = 0; i < nr_cpu_ids; i++)				\
+		if (cpu_possible(i)) {					\
+			sem->sem[i] = kmalloc_node(size, GFP_KERNEL, cpu_to_node(i));\
+			if (!sem->sem[i]) {				\
+				free_percpu_##name(sem);		\
+				return -ENOMEM;				\
+			}						\
+			ini(sem->sem[i]);				\
+		}							\
+	return 0;							\
+}									\
+									\
+static int down_read_percpu_##name(struct percpu_##name *sem)		\
+{									\
+	/*								\
+	 * Use raw_smp_processor_id() instead of smp_processor_id() to	\
+	 * suppress warning message about using it with preempt enabled.\
+	 * Preempt doesn't really matter here, locking the lock on	\
+	 * a different CPU will cause a small performance impact but	\
+	 * no race condition.						\
+	 */								\
+	int i = raw_smp_processor_id();					\
+	dn_rd(sem->sem[i]);						\
+	return i;							\
+}									\
+									\
+static void up_read_percpu_##name(struct percpu_##name *sem, int lock_cpu)\
+{									\
+	up_rd(sem->sem[lock_cpu]);					\
+}									\
+									\
+static void down_write_percpu_##name(struct percpu_##name *sem)	\
+{									\
+	int i;								\
+	for (i = 0; i < nr_cpu_ids; i++)				\
+		if (sem->sem[i])					\
+			dn_wr(sem->sem[i], i);				\
+}									\
+									\
+static void up_write_percpu_##name(struct percpu_##name *sem)		\
+{									\
+	int i;								\
+	for (i = 0; i < nr_cpu_ids; i++)				\
+		if (sem->sem[i])					\
+			up_wr(sem->sem[i]);				\
+}									\
+
+#endif
+
+declare_percpu_rw_lock(rw_semaphore, struct rw_semaphore, init_rwsem, down_read, up_read, down_write_nested, up_write)
+
 /*
  * Cookies are numeric values sent with CHANGE and REMOVE
  * uevents while resuming, removing or renaming the device.
@@ -134,7 +248,7 @@ struct dm_table {
  * Work processed by per-device workqueue.
  */
 struct mapped_device {
-	struct rw_semaphore io_lock;
+	struct percpu_rw_semaphore io_lock;
 	struct mutex suspend_lock;
 	rwlock_t map_lock;
 	atomic_t holders;
@@ -1412,8 +1526,9 @@ static void _dm_request(struct request_q
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
 	int cpu;
+	int lock_cpu;
 
-	down_read(&md->io_lock);
+	lock_cpu = down_read_percpu_rw_semaphore(&md->io_lock);
 
 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
@@ -1422,7 +1537,7 @@
 
 	/* if we're suspended, we have to queue this io for later */
 	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
-		up_read(&md->io_lock);
+		up_read_percpu_rw_semaphore(&md->io_lock, lock_cpu);
 
 		if (bio_rw(bio) != READA)
 			queue_io(md, bio);
@@ -1432,7 +1547,7 @@ static void _dm_request(struct request_q
 	}
 
 	__split_and_process_bio(md, bio);
-	up_read(&md->io_lock);
+	up_read_percpu_rw_semaphore(&md->io_lock, lock_cpu);
 	return;
 }
 
@@ -1845,8 +1960,11 @@ static struct mapped_device *alloc_dev(i
 	if (r < 0)
 		goto bad_minor;
 
+	r = init_percpu_rw_semaphore(&md->io_lock);
+	if (r < 0)
+		goto bad_semaphore;
+
 	md->type = DM_TYPE_NONE;
-	init_rwsem(&md->io_lock);
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
 	spin_lock_init(&md->deferred_lock);
@@ -1912,6 +2030,8 @@ bad_thread:
 bad_disk:
 	blk_cleanup_queue(md->queue);
 bad_queue:
+	free_percpu_rw_semaphore(&md->io_lock);
+bad_semaphore:
 	free_minor(minor);
 bad_minor:
 	module_put(THIS_MODULE);
@@ -1937,6 +2057,7 @@ static void free_dev(struct mapped_devic
 		bioset_free(md->bs);
 	blk_integrity_unregister(md->disk);
 	del_gendisk(md->disk);
+	free_percpu_rw_semaphore(&md->io_lock);
 	free_minor(minor);
 
 	spin_lock(&_minor_lock);
@@ -2356,8 +2477,9 @@ static void dm_wq_work(struct work_struc
 	struct mapped_device *md = container_of(work, struct mapped_device,
 						work);
 	struct bio *c;
+	int lock_cpu;
 
-	down_read(&md->io_lock);
+	lock_cpu = down_read_percpu_rw_semaphore(&md->io_lock);
 
 	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 		spin_lock_irq(&md->deferred_lock);
@@ -2367,17 +2489,17 @@ static void dm_wq_work(struct work_struc
 		if (!c)
 			break;
 
-		up_read(&md->io_lock);
+		up_read_percpu_rw_semaphore(&md->io_lock, lock_cpu);
 
 		if (dm_request_based(md))
 			generic_make_request(c);
 		else
 			__split_and_process_bio(md, c);
 
-		down_read(&md->io_lock);
+		lock_cpu = down_read_percpu_rw_semaphore(&md->io_lock);
 	}
 
-	up_read(&md->io_lock);
+	up_read_percpu_rw_semaphore(&md->io_lock, lock_cpu);
 }
 
 static void dm_queue_flush(struct mapped_device *md)
@@ -2513,9 +2635,9 @@ int dm_suspend(struct mapped_device *md,
 	 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
 	 * flush_workqueue(md->wq).
 	 */
-	down_write(&md->io_lock);
+	down_write_percpu_rw_semaphore(&md->io_lock);
 	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-	up_write(&md->io_lock);
+	up_write_percpu_rw_semaphore(&md->io_lock);
 
 	/*
 	 * Stop md->queue before flushing md->wq in case request-based
@@ -2533,10 +2655,10 @@ int dm_suspend(struct mapped_device *md,
 	 */
 	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
 
-	down_write(&md->io_lock);
+	down_write_percpu_rw_semaphore(&md->io_lock);
 	if (noflush)
 		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-	up_write(&md->io_lock);
+	up_write_percpu_rw_semaphore(&md->io_lock);
 
 	/* were we interrupted ? */
 	if (r < 0) {
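
For readers who want to experiment with the locking pattern outside the kernel,
below is a minimal userspace sketch of the same idea, built on POSIX rwlocks
instead of kernel rw-semaphores. It is an illustration only, not part of the
patch: the names (pcpu_rwlock, pcpu_read_lock, NSLOTS) and the use of
sched_getcpu() to pick a slot are assumptions made for the example. As in the
patch, a reader locks only the slot for its current CPU and remembers the
index for the unlock, so migrating to another CPU in between is harmless; a
writer locks every slot and therefore still excludes all readers.

/* Build with: cc -pthread pcpu_rwlock_sketch.c */
#define _GNU_SOURCE		/* for sched_getcpu() on glibc */
#include <pthread.h>
#include <sched.h>

#define NSLOTS 64		/* illustrative upper bound on CPU slots */

struct pcpu_rwlock {
	pthread_rwlock_t slot[NSLOTS];	/* one rwlock per CPU slot */
};

static int pcpu_rwlock_init(struct pcpu_rwlock *l)
{
	int i;

	for (i = 0; i < NSLOTS; i++)
		if (pthread_rwlock_init(&l->slot[i], NULL))
			return -1;
	return 0;
}

/* Reader: lock only the current CPU's slot and return its index. */
static int pcpu_read_lock(struct pcpu_rwlock *l)
{
	int cpu = sched_getcpu();
	int i = (cpu >= 0 ? cpu : 0) % NSLOTS;

	pthread_rwlock_rdlock(&l->slot[i]);
	return i;		/* caller passes this back to the unlock */
}

static void pcpu_read_unlock(struct pcpu_rwlock *l, int i)
{
	pthread_rwlock_unlock(&l->slot[i]);
}

/* Writer: lock every slot, so readers on all CPUs are excluded. */
static void pcpu_write_lock(struct pcpu_rwlock *l)
{
	int i;

	for (i = 0; i < NSLOTS; i++)
		pthread_rwlock_wrlock(&l->slot[i]);
}

static void pcpu_write_unlock(struct pcpu_rwlock *l)
{
	int i;

	for (i = 0; i < NSLOTS; i++)
		pthread_rwlock_unlock(&l->slot[i]);
}

The trade-off is the same as in the patch: a read lock/unlock touches only one
cache line, while the rarely used write lock pays a cost proportional to the
number of slots.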