This patch introduces a cluster locking module for device-mapper (and other) applications. It provides nothing that you can't do with the DLM (linux/fs/dlm). It does try to provide a simpler interface and expose a couple of the more powerful features of the DLM in a simple way. One of the nice features of this interface is that it will tell you (return '1') when acquiring a lock if another machine has grabbed the lock in the exclusive mode. This allows the user to know if the resource protected by the lock has been altered (which may require a cache invalidation, or other action). This feature is built on top of the DLM "lock value blocks" (LVBs). Whenever an exclusive lock is acquired, the (64-bit value in) the LVB is incremented. The last known value and the LVB value when the lock is acquired are compared. This interface also allows for blocking or non-blocking lock calls - determined by whether or not the user provided a callback function. The DLM only provides asynchronous locking calls, so this module implements the waiting structures for the user if they wish to use the calls in a blocking fashion.

Signed-off-by: Jonathan Brassow

---
 drivers/md/Kconfig              |    9
 drivers/md/Makefile             |    1
 drivers/md/dm-cluster-locking.c |  370 ++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-cluster-locking.h |  122 +++++++++++++
 4 files changed, 502 insertions(+)

Index: linux-2.6.31-fast-new-2/drivers/md/dm-cluster-locking.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.31-fast-new-2/drivers/md/dm-cluster-locking.c	2009-10-16 21:52:51.000000000 +0200
@@ -0,0 +1,370 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/completion.h>
+#include <linux/dlm.h>
+#include <linux/device-mapper.h>
+
+#include "dm-cluster-locking.h"
+
+#define DM_MSG_PREFIX "dm-cluster-locking"
+
+/*
+ * One instance per named lock within a lockspace.  List membership is
+ * protected by the owning dm_lockspace_instance's lock_list_lock;
+ * mode changes and DLM calls are serialized by 'lock'.
+ */
+struct dm_cluster_lock {
+	struct list_head list;
+
+	char *name;		/* full, user-supplied lock name */
+	uint32_t name_index;	/* offset so the name fits DLM_RESNAME_MAXLEN */
+
+	struct mutex lock;	/* serializes lock/convert/unlock requests */
+
+	enum dm_cluster_lock_mode mode;		/* mode being acquired/held */
+	enum dm_cluster_lock_mode old_mode;	/* last successfully held mode */
+
+	struct dlm_lksb lksb;
+	struct completion dlm_completion;	/* signalled from the DLM ast */
+
+	uint64_t counter;		/* last LVB value we observed */
+	uint64_t cluster_counter;	/* LVB landing area (lksb.sb_lvbptr) */
+
+	void (*callback)(void *data, int rtn);	/* NULL => blocking caller */
+	void *callback_data;
+};
+
+/* One instance per lockspace created by dm_cluster_lock_init(). */
+struct dm_lockspace_instance {
+	struct list_head list;
+
+	char *uuid;
+	uint32_t uuid_index;	/* offset so the uuid fits DLM_RESNAME_MAXLEN */
+
+	struct list_head lock_list;
+	spinlock_t lock_list_lock;
+	dlm_lockspace_t *lockspace;
+
+	mempool_t *lock_pool;	/* NOTE(review): never allocated or used yet */
+};
+
+static LIST_HEAD(instance_list_head);
+static DEFINE_SPINLOCK(instance_list_lock);
+
+/*
+ * dm_cluster_lock_init
+ * @uuid: name for the new lockspace (any length)
+ *
+ * Create a DLM lockspace named by the last DLM_RESNAME_MAXLEN
+ * characters of @uuid.  Returns an opaque handle on success or
+ * ERR_PTR() on failure.
+ */
+void *dm_cluster_lock_init(char *uuid)
+{
+	int len, r;
+	struct dm_lockspace_instance *dli, *tmp;
+
+	dli = kzalloc(sizeof(*dli), GFP_KERNEL);
+	if (!dli)
+		return ERR_PTR(-ENOMEM);
+
+	len = strlen(uuid) + 1;		/* includes the terminating NUL */
+	dli->uuid = kzalloc(len, GFP_KERNEL);
+	if (!dli->uuid) {
+		kfree(dli);
+		return ERR_PTR(-ENOMEM);
+	}
+	strcpy(dli->uuid, uuid);
+
+	/*
+	 * We allow 'uuid' to be any length the user wants, but
+	 * with the DLM, we can only create a lockspace with a
+	 * name that is DLM_RESNAME_MAXLEN in size.  So, we will
+	 * use the last DLM_RESNAME_MAXLEN characters given as the
+	 * lockspace name and check for conflicts.
+	 *
+	 * NOTE(review): 'len' includes the NUL here while the
+	 * analogous lock-name computation in allocate_dcl() does
+	 * not - confirm the intended truncation point.
+	 */
+	dli->uuid_index = (len > DLM_RESNAME_MAXLEN) ?
+			  len - DLM_RESNAME_MAXLEN : 0;
+
+	/* Initialize fully before publishing on instance_list_head. */
+	INIT_LIST_HEAD(&dli->lock_list);
+	spin_lock_init(&dli->lock_list_lock);
+
+	spin_lock(&instance_list_lock);
+	list_for_each_entry(tmp, &instance_list_head, list)
+		if (!strcmp(tmp->uuid + tmp->uuid_index,
+			    dli->uuid + dli->uuid_index)) {
+			/* Fix: was returning with instance_list_lock held */
+			spin_unlock(&instance_list_lock);
+			kfree(dli->uuid);
+			kfree(dli);
+			return ERR_PTR(-EBUSY);
+		}
+	list_add(&dli->list, &instance_list_head);
+	spin_unlock(&instance_list_lock);
+
+	r = dlm_new_lockspace(dli->uuid + dli->uuid_index,
+			      strlen(dli->uuid + dli->uuid_index),
+			      &dli->lockspace, 0, sizeof(uint64_t));
+	if (r) {
+		DMERR("Failed to create lockspace: %s", uuid);
+		spin_lock(&instance_list_lock);
+		list_del(&dli->list);
+		spin_unlock(&instance_list_lock);
+		kfree(dli->uuid);
+		kfree(dli);
+		return ERR_PTR(r);
+	}
+
+	return (void *)dli;
+}
+EXPORT_SYMBOL(dm_cluster_lock_init);
+
+/*
+ * dm_cluster_lock_exit
+ * @h: handle returned from dm_cluster_lock_init
+ *
+ * Tear the lockspace down.  The caller is responsible for having
+ * released all of its locks first.
+ */
+void dm_cluster_lock_exit(void *h)
+{
+	struct dm_lockspace_instance *dli = h;
+
+	spin_lock(&instance_list_lock);
+	list_del(&dli->list);
+	spin_unlock(&instance_list_lock);
+
+	dlm_release_lockspace(dli->lockspace, 1);
+	kfree(dli->uuid);
+	kfree(dli);
+}
+EXPORT_SYMBOL(dm_cluster_lock_exit);
+
+/* Find a lock by (truncated) name.  Caller must hold lock_list_lock. */
+static struct dm_cluster_lock *__lookup_dcl(struct dm_lockspace_instance *dli,
+					    const char *lock_name)
+{
+	uint32_t index;
+	struct dm_cluster_lock *dcl;
+
+	index = strlen(lock_name);
+	index = (index > DLM_RESNAME_MAXLEN) ? index - DLM_RESNAME_MAXLEN : 0;
+
+	list_for_each_entry(dcl, &dli->lock_list, list)
+		if (!strcmp(dcl->name + dcl->name_index,
+			    lock_name + index))
+			return dcl;
+
+	return NULL;
+}
+
+/* Allocate and initialize a new, unheld dm_cluster_lock. */
+static struct dm_cluster_lock *allocate_dcl(struct dm_lockspace_instance *dli,
+					    const char *lock_name)
+{
+	size_t len = strlen(lock_name);
+	struct dm_cluster_lock *new;
+
+	new = kzalloc(sizeof(*new), GFP_NOIO);
+	if (!new)
+		return NULL;
+
+	new->name = kzalloc(len + 1, GFP_NOIO);
+	if (!new->name) {
+		kfree(new);
+		return NULL;
+	}
+
+	mutex_init(&new->lock);
+	strcpy(new->name, lock_name);
+	new->name_index = (len > DLM_RESNAME_MAXLEN) ?
+ len - DLM_RESNAME_MAXLEN : 0; + + new->mode = DM_CLUSTER_LOCK_UNLOCK; + new->old_mode = DM_CLUSTER_LOCK_UNLOCK; + init_completion(&new->dlm_completion); + new->counter = (uint64_t)-1; + new->lksb.sb_lvbptr = (char *)&new->cluster_counter; + + return new; +} + +void free_dcl(struct dm_cluster_lock *dcl) +{ + BUG_ON(dcl->mode != DM_CLUSTER_LOCK_UNLOCK); + + kfree(dcl->name); + kfree(dcl); +} + +static struct dm_cluster_lock *find_dcl(struct dm_lockspace_instance *dli, + const char *lock_name) +{ + struct dm_cluster_lock *dcl, *new; + + spin_lock(&dli->lock_list_lock); + dcl = __lookup_dcl(dli, lock_name); + spin_unlock(&dli->lock_list_lock); + + if (dcl) + return dcl; + + new = allocate_dcl(dli, lock_name); + if (!new) + return NULL; + + spin_lock(&dli->lock_list_lock); + dcl = __lookup_dcl(dli, lock_name); + if (!dcl) { + dcl = new; + list_add(&dcl->list, &dli->lock_list); + } else + free_dcl(new); + spin_unlock(&dli->lock_list_lock); + + return dcl; +} + +static int lock_return_value(struct dm_cluster_lock *dcl) +{ + int r = 0; + uint64_t old = dcl->counter; + + if (dcl->lksb.sb_status) { + DMERR("Error aquiring lock for %s: %d", + dcl->name, dcl->lksb.sb_status); + + /* Revert to previous state */ + dcl->mode = dcl->old_mode; + + return dcl->lksb.sb_status; + } + + dcl->counter = dcl->cluster_counter; + + /* + * If the counters differ /and/ we are aquiring + * the lock in a mode where we care to find out + * if they are different, then return 1 + */ + if ((dcl->mode > DM_CLUSTER_LOCK_MONITOR) && + ((old == (uint64_t)-1) || (old != dcl->cluster_counter))) + r = 1; + + /* + * Lock has been safely aquired, we can now + * stop tracking the old mode and update it for + * its next change. 
+	 */
+	dcl->old_mode = dcl->mode;
+
+	return r;
+}
+
+/* DLM ast: wake the blocked caller, or hand the result to the callback. */
+static void lock_obtained(void *context)
+{
+	struct dm_cluster_lock *dcl = context;
+
+	if (!dcl->callback)
+		complete(&dcl->dlm_completion);
+	else
+		dcl->callback(dcl->callback_data, lock_return_value(dcl));
+}
+
+/*
+ * dm_cluster_lock_by_str
+ * @h:	       handle from dm_cluster_lock_init
+ * @lock_name: lock name (only the last DLM_RESNAME_MAXLEN chars matter)
+ * @mode:      DM_CLUSTER_LOCK_* mode to move the lock to
+ * @callback:  if non-NULL the call is non-blocking and @callback is
+ *	       invoked (from DLM ast context) with the result
+ * @data:      context handed back through @callback
+ *
+ * NOTE(review): a request for the mode already held returns 0 without
+ * ever invoking @callback - confirm non-blocking callers expect that.
+ */
+int dm_cluster_lock_by_str(void *h, const char *lock_name,
+			   enum dm_cluster_lock_mode mode,
+			   void (*callback)(void *data, int rtn), void *data)
+{
+	int r;
+	int dlm_mode = DLM_LOCK_IV;
+	uint32_t flags = DLM_LKF_VALBLK;
+	struct dm_cluster_lock *dcl;
+	struct dm_lockspace_instance *dli = h;
+
+	dcl = find_dcl(dli, lock_name);
+	if (!dcl)
+		return -ENOMEM;
+
+	mutex_lock(&dcl->lock);
+	/* Do we already have the lock in this mode? */
+	if (mode == dcl->mode) {
+		mutex_unlock(&dcl->lock);
+		return 0;
+	}
+
+	/*
+	 * Allowing SHARED <-> EXCLUSIVE conversions would allow
+	 * the user to shoot themselves in the foot with deadlock
+	 * issues. If they want to do that, then they can use the
+	 * DLM directly. Otherwise, they will have to move to a
+	 * lower state then back up.
+	 */
+	if ((mode > DM_CLUSTER_LOCK_MONITOR) &&
+	    (dcl->mode > DM_CLUSTER_LOCK_MONITOR)) {
+		DMERR("Illegal locking conversion issued.");
+		mutex_unlock(&dcl->lock);
+		return -EINVAL;
+	}
+
+	dcl->callback = callback;
+	dcl->callback_data = data;
+
+	if (mode != DM_CLUSTER_LOCK_UNLOCK) {
+		/* Is it a new lock, or can we just convert? 
*/ + if (dcl->mode != DM_CLUSTER_LOCK_UNLOCK) + flags |= DLM_LKF_CONVERT; + + switch (mode) { + case DM_CLUSTER_LOCK_MONITOR: + dlm_mode = DLM_LOCK_NL; + break; + case DM_CLUSTER_LOCK_SHARED: + dlm_mode = DLM_LOCK_CR; + break; + case DM_CLUSTER_LOCK_EXCLUSIVE: + dlm_mode = DLM_LOCK_EX; + break; + default: + mutex_unlock(&dcl->lock); + return -EINVAL; + } + + r = dlm_lock(dli->lockspace, dlm_mode, &dcl->lksb, + flags, dcl->name + dcl->name_index, + strlen(dcl->name + dcl->name_index), 0, + lock_obtained, dcl, NULL); + if (r) { + DMERR("Failed to issue DLM lock operation: %d", r); + mutex_unlock(&dcl->lock); + return r; + } + + dcl->mode = mode; + } else { + r = dlm_unlock(dli->lockspace, dcl->lksb.sb_lkid, + DLM_LKF_FORCEUNLOCK, NULL, NULL); + if (r) { + mutex_unlock(&dcl->lock); + return r; + } + + free_dcl(dcl); + dcl = NULL; + } + + if (dcl && !dcl->callback) { + wait_for_completion(&dcl->dlm_completion); + r = lock_return_value(dcl); + } + + mutex_unlock(&dcl->lock); + return r; +} +EXPORT_SYMBOL(dm_cluster_lock_by_str); + +int dm_cluster_lock(void *h, uint64_t lock_nr, enum dm_cluster_lock_mode mode, + void (*callback)(void *data, int rtn), void *data) +{ + char lock_str[32]; /* Enough to hold printed 64-bit number */ + + sprintf(lock_str, "%llu", lock_nr); + + return dm_cluster_lock_by_str(h, lock_str, mode, callback, data); +} +EXPORT_SYMBOL(dm_cluster_lock); + +static int __init dm_cluster_lock_module_init(void) +{ + return 0; +} + +static void __exit dm_cluster_lock_module_exit(void) +{ +} + +module_init(dm_cluster_lock_module_init); +module_exit(dm_cluster_lock_module_exit); + +MODULE_DESCRIPTION("DM Cluster Locking module"); +MODULE_AUTHOR("Jonathan Brassow"); +MODULE_LICENSE("GPL"); Index: linux-2.6.31-fast-new-2/drivers/md/Kconfig =================================================================== --- linux-2.6.31-fast-new-2.orig/drivers/md/Kconfig 2009-10-16 20:54:50.000000000 +0200 +++ linux-2.6.31-fast-new-2/drivers/md/Kconfig 2009-10-16 
21:52:51.000000000 +0200 @@ -349,4 +349,13 @@ config DM_IOBAND If unsure, say N. +config DM_CLUSTER_LOCKING + tristate "DM Cluster Locking module (EXPERIMENTAL)" + select DLM + ---help--- + The DM Cluster Locking module provides a simple set of + cluster locking commands. It is a wrapper around the + more versatile (but more complex) DLM - which is also + found in the kernel. + endif # MD Index: linux-2.6.31-fast-new-2/drivers/md/Makefile =================================================================== --- linux-2.6.31-fast-new-2.orig/drivers/md/Makefile 2009-10-16 20:54:50.000000000 +0200 +++ linux-2.6.31-fast-new-2/drivers/md/Makefile 2009-10-16 21:52:51.000000000 +0200 @@ -57,6 +57,7 @@ obj-$(CONFIG_DM_MULTISNAPSHOT_FUJITA_DAN obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o +obj-$(CONFIG_DM_CLUSTER_LOCKING) += dm-cluster-locking.o quiet_cmd_unroll = UNROLL $@ cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \ Index: linux-2.6.31-fast-new-2/drivers/md/dm-cluster-locking.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.31-fast-new-2/drivers/md/dm-cluster-locking.h 2009-10-16 21:52:51.000000000 +0200 @@ -0,0 +1,122 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ +#ifndef __DM_CLUSTER_LOCKING_DOT_H__ +#define __DM_CLUSTER_LOCKING_DOT_H__ + +/* + * The 'dm_cluster_lock_*' functions are used to provide a simplified + * interface to the DLM (linux/fs/dlm). 
+ *
+ * Simplified usage example (See function definitions for more info):
+ *	h = dm_cluster_lock_init(UUID);
+ *	dm_cluster_lock(h, 1234, DM_CLUSTER_LOCK_EXCLUSIVE, NULL, NULL);
+ *	dm_cluster_lock(h, 1234, DM_CLUSTER_LOCK_UNLOCK, NULL, NULL);
+ *	dm_cluster_lock_exit(h);
+ *
+ * The real benefit of this locking interface is the DM_CLUSTER_LOCK_MONITOR
+ * action. When a lock is acquired in this mode, you will be able to tell
+ * if another machine acquired the lock in the DM_CLUSTER_LOCK_EXCLUSIVE state
+ * when you attempt to acquire the lock shared or exclusive. In other words,
+ * when you acquire the lock, you will be able to see if the resource being
+ * protected by the lock has changed. This interface also allows the user
+ * to choose between blocking and non-blocking lock calls.
+ *
+ * Two example solutions might be...
+ * 1) Creating cluster-aware device-mapper snapshots
+ *    - A lockspace is created with a UUID of "cluster-aware-exception-store".
+ *    - Locks are acquired by a UUID representing the snapshot targets.
+ *    - Exclusive locks are taken when adding exceptions
+ *    - Shared locks are taken when looking-up exceptions
+ *    A machine knows if it needs to look on disk for new exceptions that
+ *    may have been added if the dm_cluster_lock call returns '1' when
+ *    converting a lock from DM_CLUSTER_LOCK_MONITOR to
+ *    DM_CLUSTER_LOCK_SHARED/EXCLUSIVE. This is an ideal usage of this
+ *    interface. The lock traffic is low and few resources are used to
+ *    add some good benefit.
+ *
+ * 2) RAID-X implementation with block level locking.
+ *    - A lockspace is created for /each/ target (device-mapper device)
+ *    - Locks are acquired (by number) on portions of the disk as they are used
+ *    We can still use the return value from the locking call to determine
+ *    if any stripe caching needs to be invalidated and re-read. This is not
+ *    an ideal usage of this interface due to the amount of traffic/resources
+ *    that may be required. 
This can be optimized in the future by providing
+ *    hashing, caching, and preallocation of resources in this module. Careful
+ *    coding of the RAID module to minimize lock calls can also help.
+ *
+ * The difference in the examples above is that the first creates a lockspace
+ * with the name of module, while taking out locks on the whole device. The
+ * second creates multiple lockspaces - one for each device - and grabs locks
+ * based on block, chunk, or other portion of the disk.
+ */
+
+enum dm_cluster_lock_mode {
+	DM_CLUSTER_LOCK_UNLOCK,
+
+	/*
+	 * DM_CLUSTER_LOCK_MONITOR
+	 *
+	 * Acquire the lock in this mode to monitor if another machine
+	 * acquires this lock in the DM_CLUSTER_LOCK_EXCLUSIVE mode. Later,
+	 * when acquiring the lock in DM_CLUSTER_LOCK_EXCLUSIVE or
+	 * DM_CLUSTER_LOCK_SHARED mode, dm_cluster_lock will return '1' if
+	 * the lock had been acquired DM_CLUSTER_LOCK_EXCLUSIVE.
+	 *
+	 * This is useful because it gives the programmer a way of knowing if
+	 * they need to perform an operation (invalidate cache, read additional
+	 * metadata, etc) after acquiring the cluster lock.
+	 */
+	DM_CLUSTER_LOCK_MONITOR,
+
+	DM_CLUSTER_LOCK_SHARED,
+
+	DM_CLUSTER_LOCK_EXCLUSIVE,
+};
+
+/**
+ * dm_cluster_lock_init
+ * @uuid: The name given to this lockspace
+ *
+ * Returns: handle pointer on success, ERR_PTR(-EXXX) on failure
+ **/
+void *dm_cluster_lock_init(char *uuid);
+
+/**
+ * dm_cluster_lock_exit
+ * @h: The handle returned from dm_cluster_lock_init
+ */
+void dm_cluster_lock_exit(void *h);
+
+/**
+ * dm_cluster_lock
+ * @h	   : The handle returned from 'dm_cluster_lock_init'
+ * @lock_nr: The lock number
+ * @mode   : One of DM_CLUSTER_LOCK_* (how to hold the lock)
+ * @callback: If provided, function will be non-blocking and use this
+ *	      to notify caller when the lock is acquired. If not provided,
+ *	      this function will block until the lock is acquired.
+ * @callback_data: User context data that will be provided via the callback fn. 
+ *
+ * Returns: -EXXX on error or 0 on success for DM_CLUSTER_LOCK_*
+ *	    1 is a possible return if EXCLUSIVE/SHARED is the lock action,
+ *	    the lock operation is successful, and an exclusive lock was
+ *	    acquired by another machine while the lock was held in the
+ *	    DM_CLUSTER_LOCK_MONITOR state.
+ **/
+int dm_cluster_lock(void *h, uint64_t lock_nr, enum dm_cluster_lock_mode mode,
+		    void (*callback)(void *data, int rtn), void *data);
+
+/*
+ * dm_cluster_lock_by_str
+ * @lock_name: The lock name (any length; only the last DLM_RESNAME_MAXLEN
+ *	       characters are significant)
+ *
+ * Otherwise, the same as 'dm_cluster_lock'
+ */
+int dm_cluster_lock_by_str(void *h, const char *lock_name,
+			   enum dm_cluster_lock_mode mode,
+			   void (*callback)(void *data, int rtn), void *data);
+
+#endif /* __DM_CLUSTER_LOCKING_DOT_H__ */