Refine use of zv_state_lock.

Use zv_state_lock to protect all members of zvol_state structure, add
relevant ASSERT()s. Take zv_suspend_lock before zv_state_lock, do not
hold zv_state_lock across suspend/resume.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Boris Protopopov <boris.protopopov@actifio.com>
Closes #6226
This commit is contained in:
Boris Protopopov 2017-06-13 12:03:44 -04:00 committed by Brian Behlendorf
parent 82710e993a
commit 58404a73db

View File

@ -50,7 +50,15 @@
* time comes to remove the structure from the list, it is not in use, and
* therefore, it can be taken off zvol_state_list and freed.
*
* The minor operations are issued to the spa->spa_zvol_taskq quues, that are
* The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
* e.g. for the duration of receive and rollback operations. This lock can be
* held for significant periods of time. Given that it is undesirable to hold
* mutexes for long periods of time, the following lock ordering applies:
* - take zvol_state_lock if necessary, to protect zvol_state_list
* - take zv_suspend_lock if necessary, by the code path in question
* - take zv_state_lock to protect zvol_state_t
*
* The minor operations are issued to spa->spa_zvol_taskq queues, that are
* single-threaded (to preserve order of minor operations), and are executed
* through the zvol_task_cb that dispatches the specific operations. Therefore,
* these operations are serialized per pool. Consequently, we can be certain
@ -156,49 +164,87 @@ zvol_name_hash(const char *name)
}
/*
* Find a zvol_state_t given the full major+minor dev_t.
* Find a zvol_state_t given the full major+minor dev_t. If found,
* return with zv_state_lock taken, otherwise, return (NULL) without
* taking zv_state_lock.
*/
static zvol_state_t *
zvol_find_by_dev(dev_t dev)
{
zvol_state_t *zv;
ASSERT(MUTEX_HELD(&zvol_state_lock));
mutex_enter(&zvol_state_lock);
for (zv = list_head(&zvol_state_list); zv != NULL;
zv = list_next(&zvol_state_list, zv)) {
if (zv->zv_dev == dev)
mutex_enter(&zv->zv_state_lock);
if (zv->zv_dev == dev) {
mutex_exit(&zvol_state_lock);
return (zv);
}
mutex_exit(&zv->zv_state_lock);
}
mutex_exit(&zvol_state_lock);
return (NULL);
}
/*
* Find a zvol_state_t given the name and hash generated by zvol_name_hash.
* If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
* return (NULL) without taking the locks. The zv_suspend_lock is always taken
* before zv_state_lock. The mode argument indicates the mode (including none)
* for zv_suspend_lock to be taken.
*/
static zvol_state_t *
zvol_find_by_name_hash(const char *name, uint64_t hash)
zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
{
zvol_state_t *zv;
struct hlist_node *p;
ASSERT(MUTEX_HELD(&zvol_state_lock));
mutex_enter(&zvol_state_lock);
hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
zv = hlist_entry(p, zvol_state_t, zv_hlink);
mutex_enter(&zv->zv_state_lock);
if (zv->zv_hash == hash &&
strncmp(zv->zv_name, name, MAXNAMELEN) == 0)
strncmp(zv->zv_name, name, MAXNAMELEN) == 0) {
/*
* this is the right zvol, take the locks in the
* right order
*/
if (mode != RW_NONE &&
!rw_tryenter(&zv->zv_suspend_lock, mode)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, mode);
mutex_enter(&zv->zv_state_lock);
/*
* zvol cannot be renamed as we continue
* to hold zvol_state_lock
*/
ASSERT(zv->zv_hash == hash &&
strncmp(zv->zv_name, name, MAXNAMELEN)
== 0);
}
mutex_exit(&zvol_state_lock);
return (zv);
}
mutex_exit(&zv->zv_state_lock);
}
mutex_exit(&zvol_state_lock);
return (NULL);
}
/*
* Find a zvol_state_t given the name provided at zvol_alloc() time.
* Find a zvol_state_t given the name.
* If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
* return (NULL) without taking the locks. The zv_suspend_lock is always taken
* before zv_state_lock. The mode argument indicates the mode (including none)
* for zv_suspend_lock to be taken.
*/
static zvol_state_t *
zvol_find_by_name(const char *name)
zvol_find_by_name(const char *name, int mode)
{
return (zvol_find_by_name_hash(name, zvol_name_hash(name)));
return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
}
@ -295,9 +341,12 @@ zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
{
struct block_device *bdev;
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
bdev = bdget_disk(zv->zv_disk, 0);
if (bdev == NULL)
return;
set_capacity(zv->zv_disk, volsize >> 9);
zv->zv_volsize = volsize;
check_disk_size_change(zv->zv_disk, bdev);
@ -391,13 +440,14 @@ zvol_set_volsize(const char *name, uint64_t volsize)
if (readonly)
return (SET_ERROR(EROFS));
mutex_enter(&zvol_state_lock);
zv = zvol_find_by_name(name);
if (zv != NULL)
mutex_enter(&zv->zv_state_lock);
mutex_exit(&zvol_state_lock);
zv = zvol_find_by_name(name, RW_READER);
ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
RW_READ_HELD(&zv->zv_suspend_lock)));
if (zv == NULL || zv->zv_objset == NULL) {
if (zv != NULL)
rw_exit(&zv->zv_suspend_lock);
if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
FTAG, &os)) != 0) {
if (zv != NULL)
@ -408,7 +458,6 @@ zvol_set_volsize(const char *name, uint64_t volsize)
if (zv != NULL)
zv->zv_objset = os;
} else {
rw_enter(&zv->zv_suspend_lock, RW_READER);
os = zv->zv_objset;
}
@ -435,6 +484,7 @@ out:
if (zv != NULL)
mutex_exit(&zv->zv_state_lock);
return (SET_ERROR(error));
}
@ -485,23 +535,20 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
dmu_tx_t *tx;
int error;
mutex_enter(&zvol_state_lock);
zv = zvol_find_by_name(name, RW_READER);
zv = zvol_find_by_name(name);
if (zv == NULL) {
mutex_exit(&zvol_state_lock);
if (zv == NULL)
return (SET_ERROR(ENXIO));
}
mutex_enter(&zv->zv_state_lock);
mutex_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
RW_READ_HELD(&zv->zv_suspend_lock));
if (zv->zv_flags & ZVOL_RDONLY) {
mutex_exit(&zv->zv_state_lock);
rw_exit(&zv->zv_suspend_lock);
return (SET_ERROR(EROFS));
}
rw_enter(&zv->zv_suspend_lock, RW_READER);
tx = dmu_tx_create(zv->zv_objset);
dmu_tx_hold_bonus(tx, ZVOL_OBJ);
error = dmu_tx_assign(tx, TXG_WAIT);
@ -516,9 +563,9 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
if (error == 0)
zv->zv_volblocksize = volblocksize;
}
rw_exit(&zv->zv_suspend_lock);
mutex_exit(&zv->zv_state_lock);
rw_exit(&zv->zv_suspend_lock);
return (SET_ERROR(error));
}
@ -1060,6 +1107,9 @@ zvol_setup_zv(zvol_state_t *zv)
uint64_t ro;
objset_t *os = zv->zv_objset;
ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
RW_LOCK_HELD(&zv->zv_suspend_lock));
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
if (error)
return (SET_ERROR(error));
@ -1094,6 +1144,9 @@ zvol_setup_zv(zvol_state_t *zv)
static void
zvol_shutdown_zv(zvol_state_t *zv)
{
ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
RW_LOCK_HELD(&zv->zv_suspend_lock));
zil_close(zv->zv_zilog);
zv->zv_zilog = NULL;
@ -1127,24 +1180,27 @@ zvol_suspend(const char *name)
{
zvol_state_t *zv;
mutex_enter(&zvol_state_lock);
zv = zvol_find_by_name(name);
if (zv == NULL) {
mutex_exit(&zvol_state_lock);
zv = zvol_find_by_name(name, RW_WRITER);
if (zv == NULL)
return (NULL);
}
mutex_enter(&zv->zv_state_lock);
mutex_exit(&zvol_state_lock);
/* block all I/O, release in zvol_resume. */
rw_enter(&zv->zv_suspend_lock, RW_WRITER);
ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
RW_WRITE_HELD(&zv->zv_suspend_lock));
atomic_inc(&zv->zv_suspend_ref);
if (zv->zv_open_count > 0)
zvol_shutdown_zv(zv);
/*
* do not hold zv_state_lock across suspend/resume to
* avoid locking up zvol lookups
*/
mutex_exit(&zv->zv_state_lock);
/* zv_suspend_lock is released in zvol_resume() */
return (zv);
}
@ -1155,11 +1211,8 @@ zvol_resume(zvol_state_t *zv)
ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
/*
* Cannot take zv_state_lock here with zv_suspend_lock
* held; however, the latter is held in exclusive mode,
* so it is not necessary to do so
*/
mutex_enter(&zv->zv_state_lock);
if (zv->zv_open_count > 0) {
VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
@ -1168,6 +1221,9 @@ zvol_resume(zvol_state_t *zv)
error = zvol_setup_zv(zv);
}
mutex_exit(&zv->zv_state_lock);
rw_exit(&zv->zv_suspend_lock);
/*
* We need this because we don't hold zvol_state_lock while releasing
@ -1186,6 +1242,9 @@ zvol_first_open(zvol_state_t *zv)
objset_t *os;
int error, locked = 0;
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
/*
* In all other cases the spa_namespace_lock is taken before the
* bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
@ -1233,6 +1292,9 @@ out_mutex:
static void
zvol_last_close(zvol_state_t *zv)
{
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
zvol_shutdown_zv(zv);
dmu_objset_disown(zv->zv_objset, zv);
@ -1243,14 +1305,15 @@ static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
zvol_state_t *zv;
int error = 0, drop_suspend = 0;
int error = 0;
boolean_t drop_suspend = B_FALSE;
ASSERT(!mutex_owned(&zvol_state_lock));
mutex_enter(&zvol_state_lock);
/*
* Obtain a copy of private_data under the lock to make sure
* that either the result of zvol free code path setting
* Obtain a copy of private_data under the zvol_state_lock to make
* sure that either the result of zvol free code path setting
* bdev->bd_disk->private_data to NULL is observed, or zvol_free()
* is not called on this zv because of the positive zv_open_count.
*/
@ -1259,14 +1322,25 @@ zvol_open(struct block_device *bdev, fmode_t flag)
mutex_exit(&zvol_state_lock);
return (SET_ERROR(-ENXIO));
}
/* take zv_suspend_lock before zv_state_lock */
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
/*
* make sure zvol is not suspended during first open
* (hold zv_suspend_lock), otherwise, drop the lock
*/
if (zv->zv_open_count == 0) {
drop_suspend = B_TRUE;
} else {
rw_exit(&zv->zv_suspend_lock);
}
mutex_exit(&zvol_state_lock);
if (zv->zv_open_count == 0) {
/* make sure zvol is not suspended when first open */
rw_enter(&zv->zv_suspend_lock, RW_READER);
drop_suspend = 1;
error = zvol_first_open(zv);
if (error)
goto out_mutex;
@ -1285,9 +1359,9 @@ out_open_count:
if (zv->zv_open_count == 0)
zvol_last_close(zv);
out_mutex:
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
mutex_exit(&zv->zv_state_lock);
return (SET_ERROR(error));
}
@ -1300,27 +1374,38 @@ static int
zvol_release(struct gendisk *disk, fmode_t mode)
{
zvol_state_t *zv;
boolean_t drop_suspend = B_FALSE;
ASSERT(!mutex_owned(&zvol_state_lock));
mutex_enter(&zvol_state_lock);
zv = disk->private_data;
ASSERT(zv && zv->zv_open_count > 0);
/* take zv_suspend_lock before zv_state_lock */
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
mutex_exit(&zvol_state_lock);
/* make sure zvol is not suspended when last close */
/*
* make sure zvol is not suspended during last close
* (hold zv_suspend_lock), otherwise, drop the lock
*/
if (zv->zv_open_count == 1)
rw_enter(&zv->zv_suspend_lock, RW_READER);
drop_suspend = B_TRUE;
else
rw_exit(&zv->zv_suspend_lock);
zv->zv_open_count--;
if (zv->zv_open_count == 0) {
if (zv->zv_open_count == 0)
zvol_last_close(zv);
rw_exit(&zv->zv_suspend_lock);
}
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
#ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
return (0);
#endif
@ -1430,10 +1515,11 @@ zvol_probe(dev_t dev, int *part, void *arg)
zvol_state_t *zv;
struct kobject *kobj;
mutex_enter(&zvol_state_lock);
zv = zvol_find_by_dev(dev);
kobj = zv ? get_disk(zv->zv_disk) : NULL;
mutex_exit(&zvol_state_lock);
ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
if (zv)
mutex_exit(&zv->zv_state_lock);
return (kobj);
}
@ -1571,6 +1657,8 @@ zvol_free(void *arg)
zvol_state_t *zv = arg;
ASSERT(!MUTEX_HELD(&zvol_state_lock));
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
ASSERT(zv->zv_open_count == 0);
ASSERT(zv->zv_disk->private_data == NULL);
@ -1611,17 +1699,14 @@ zvol_create_minor_impl(const char *name)
return (SET_ERROR(-idx));
minor = idx << ZVOL_MINOR_BITS;
mutex_enter(&zvol_state_lock);
zv = zvol_find_by_name_hash(name, hash);
zv = zvol_find_by_name_hash(name, hash, RW_NONE);
if (zv) {
mutex_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
mutex_exit(&zv->zv_state_lock);
ida_simple_remove(&zvol_ida, idx);
return (SET_ERROR(EEXIST));
}
mutex_exit(&zvol_state_lock);
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
@ -1715,10 +1800,9 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
int readonly = get_disk_ro(zv->zv_disk);
ASSERT(MUTEX_HELD(&zvol_state_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
rw_enter(&zv->zv_suspend_lock, RW_READER);
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
rw_exit(&zv->zv_suspend_lock);
/* move to new hashtable entry */
zv->zv_hash = zvol_name_hash(zv->zv_name);
@ -1960,15 +2044,15 @@ zvol_remove_minors_impl(const char *name)
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
mutex_enter(&zv->zv_state_lock);
if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
(strncmp(zv->zv_name, name, namelen) == 0 &&
(zv->zv_name[namelen] == '/' ||
zv->zv_name[namelen] == '@'))) {
/*
* By taking zv_state_lock here, we guarantee that no
* By holding zv_state_lock here, we guarantee that no
* one is currently using this zv
*/
mutex_enter(&zv->zv_state_lock);
/* If in use, leave alone */
if (zv->zv_open_count > 0 ||
@ -1979,7 +2063,10 @@ zvol_remove_minors_impl(const char *name)
zvol_remove(zv);
/* clear this so zvol_open won't open it */
/*
* clear this while holding zvol_state_lock so
* zvol_open won't open it
*/
zv->zv_disk->private_data = NULL;
/* Drop zv_state_lock before zvol_free() */
@ -1992,6 +2079,8 @@ zvol_remove_minors_impl(const char *name)
list_insert_head(&free_list, zv);
else
tid = t;
} else {
mutex_exit(&zv->zv_state_lock);
}
}
mutex_exit(&zvol_state_lock);
@ -2025,12 +2114,12 @@ zvol_remove_minor_impl(const char *name)
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
mutex_enter(&zv->zv_state_lock);
if (strcmp(zv->zv_name, name) == 0) {
/*
* By taking zv_state_lock here, we guarantee that no
* By holding zv_state_lock here, we guarantee that no
* one is currently using this zv
*/
mutex_enter(&zv->zv_state_lock);
/* If in use, leave alone */
if (zv->zv_open_count > 0 ||
@ -2045,6 +2134,8 @@ zvol_remove_minor_impl(const char *name)
mutex_exit(&zv->zv_state_lock);
break;
} else {
mutex_exit(&zv->zv_state_lock);
}
}
@ -2077,9 +2168,13 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
mutex_enter(&zv->zv_state_lock);
/* If in use, leave alone */
if (zv->zv_open_count > 0)
if (zv->zv_open_count > 0) {
mutex_exit(&zv->zv_state_lock);
continue;
}
if (strcmp(zv->zv_name, oldname) == 0) {
zvol_rename_minor(zv, newname);
@ -2091,6 +2186,8 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
zv->zv_name + oldnamelen + 1);
zvol_rename_minor(zv, name);
}
mutex_exit(&zv->zv_state_lock);
}
mutex_exit(&zvol_state_lock);