mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-22 02:27:36 +03:00
zvol: stop using zvol_state_lock to protect OS-side private data
zvol_state_lock is intended to protect access to the global name->zvol lists (zvol_find_by_name()), but has also been used to control access to OS-side private data, accessed through whatever kernel object is used to represent the volume (gendisk, geom, etc). This appears to have been necessary to some degree because the OS-side object is what's used to get a handle on zvol_state_t, so zv_state_lock and zv_suspend_lock can't be used to manage access, but also, with the private object and the zvol_state_t being shut down and destroyed at the same time in zvol_os_free(), we must ensure that the private object pointer only ever corresponds to a real zvol_state_t, not one in partial destruction. Taking the global lock seems like a convenient way to ensure this. The problem with this is that zvol_state_lock does not actually protect access to the zvol_state_t internals, so we need to take zv_state_lock and/or zv_suspend_lock. If those are contended, this can then cause OS-side operations (eg zvol_open()) to sleep to wait for them while holding zvol_state_lock. This then blocks out all other OS-side operations which want to get the private data, and any ZFS-side control operations that would take the write half of the lock. It's even worse if ZFS-side operations induce OS-side calls back into the zvol (eg creating a zvol triggers a partition probe inside the kernel, and also a userspace access from udev to set up device links). And it gets even worse again if anything decides to defer those ops to a task and wait on them, which zvol_remove_minors_impl() will do under high load. However, since the previous commit, we have a guarantee that the private data pointer will always be NULL'd out in zvol_os_remove_minor() _before_ the zvol_state_t is made invalid, but it won't happen until all users are ejected.
So, if we make access to the private object pointer atomic, we remove the need to take a global lockout to access it, and so we can remove all acquisitions of zvol_state_lock from the OS side. While here, I've rewritten much of the locking theory comment at the top of zvol.c. It wasn't wrong, but it hadn't been followed exactly, so I've tried to describe the purpose of each lock in a little more detail, and in particular describe where it should and shouldn't be used. Sponsored-by: Klara, Inc. Sponsored-by: Railway Corporation Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Fedor Uporov <fuporov.vstack@gmail.com> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Closes #17625
This commit is contained in:
committed by
Brian Behlendorf
parent
96f9d271ea
commit
8a0e5e8b54
@@ -225,25 +225,14 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
|
||||
}
|
||||
|
||||
retry:
|
||||
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
|
||||
/*
|
||||
* Obtain a copy of private under zvol_state_lock to make sure either
|
||||
* the result of zvol free code setting private to NULL is observed,
|
||||
* or the zv is protected from being freed because of the positive
|
||||
* zv_open_count.
|
||||
*/
|
||||
zv = pp->private;
|
||||
if (zv == NULL) {
|
||||
rw_exit(&zvol_state_lock);
|
||||
err = SET_ERROR(ENXIO);
|
||||
goto out_locked;
|
||||
}
|
||||
zv = atomic_load_ptr(&pp->private);
|
||||
if (zv == NULL)
|
||||
return (SET_ERROR(ENXIO));
|
||||
|
||||
mutex_enter(&zv->zv_state_lock);
|
||||
if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
|
||||
rw_exit(&zvol_state_lock);
|
||||
err = SET_ERROR(ENXIO);
|
||||
goto out_zv_locked;
|
||||
goto out_locked;
|
||||
}
|
||||
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
|
||||
|
||||
@@ -256,8 +245,24 @@ retry:
|
||||
drop_suspend = B_TRUE;
|
||||
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
|
||||
mutex_exit(&zv->zv_state_lock);
|
||||
|
||||
/*
|
||||
* Removal may happen while the locks are down, so
|
||||
* we can't trust zv any longer; we have to start over.
|
||||
*/
|
||||
zv = atomic_load_ptr(&pp->private);
|
||||
if (zv == NULL)
|
||||
return (SET_ERROR(ENXIO));
|
||||
|
||||
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
|
||||
mutex_enter(&zv->zv_state_lock);
|
||||
|
||||
if (zv->zv_zso->zso_dying ||
|
||||
zv->zv_flags & ZVOL_REMOVING) {
|
||||
err = SET_ERROR(ENXIO);
|
||||
goto out_locked;
|
||||
}
|
||||
|
||||
/* Check to see if zv_suspend_lock is needed. */
|
||||
if (zv->zv_open_count != 0) {
|
||||
rw_exit(&zv->zv_suspend_lock);
|
||||
@@ -265,7 +270,6 @@ retry:
|
||||
}
|
||||
}
|
||||
}
|
||||
rw_exit(&zvol_state_lock);
|
||||
|
||||
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
||||
|
||||
@@ -293,7 +297,7 @@ retry:
|
||||
if (drop_namespace)
|
||||
mutex_exit(&spa_namespace_lock);
|
||||
if (err)
|
||||
goto out_zv_locked;
|
||||
goto out_locked;
|
||||
pp->mediasize = zv->zv_volsize;
|
||||
pp->stripeoffset = 0;
|
||||
pp->stripesize = zv->zv_volblocksize;
|
||||
@@ -328,9 +332,8 @@ out_opened:
|
||||
zvol_last_close(zv);
|
||||
wakeup(zv);
|
||||
}
|
||||
out_zv_locked:
|
||||
mutex_exit(&zv->zv_state_lock);
|
||||
out_locked:
|
||||
mutex_exit(&zv->zv_state_lock);
|
||||
if (drop_suspend)
|
||||
rw_exit(&zv->zv_suspend_lock);
|
||||
return (err);
|
||||
@@ -344,12 +347,9 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
|
||||
boolean_t drop_suspend = B_TRUE;
|
||||
int new_open_count;
|
||||
|
||||
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
|
||||
zv = pp->private;
|
||||
if (zv == NULL) {
|
||||
rw_exit(&zvol_state_lock);
|
||||
zv = atomic_load_ptr(&pp->private);
|
||||
if (zv == NULL)
|
||||
return (SET_ERROR(ENXIO));
|
||||
}
|
||||
|
||||
mutex_enter(&zv->zv_state_lock);
|
||||
if (zv->zv_flags & ZVOL_EXCL) {
|
||||
@@ -376,6 +376,15 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
|
||||
mutex_exit(&zv->zv_state_lock);
|
||||
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
|
||||
mutex_enter(&zv->zv_state_lock);
|
||||
|
||||
/*
|
||||
* Unlike in zvol_geom_open(), we don't check if
|
||||
* removal started here, because we might be one of the
|
||||
* openers that needs to be thrown out! If we're the
|
||||
* last, we need to call zvol_last_close() below to
|
||||
* finish cleanup. So, no special treatment for us.
|
||||
*/
|
||||
|
||||
/* Check to see if zv_suspend_lock is needed. */
|
||||
new_open_count = zv->zv_open_count - count;
|
||||
if (new_open_count != 0) {
|
||||
@@ -386,7 +395,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
|
||||
} else {
|
||||
drop_suspend = B_FALSE;
|
||||
}
|
||||
rw_exit(&zvol_state_lock);
|
||||
|
||||
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
||||
|
||||
@@ -439,7 +447,7 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
|
||||
("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
|
||||
pp->name, acr, acw, ace));
|
||||
|
||||
if (pp->private == NULL) {
|
||||
if (atomic_load_ptr(&pp->private) == NULL) {
|
||||
if (acr <= 0 && acw <= 0 && ace <= 0)
|
||||
return (0);
|
||||
return (pp->error);
|
||||
@@ -906,25 +914,14 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
|
||||
boolean_t drop_suspend = B_FALSE;
|
||||
|
||||
retry:
|
||||
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
|
||||
/*
|
||||
* Obtain a copy of si_drv2 under zvol_state_lock to make sure either
|
||||
* the result of zvol free code setting si_drv2 to NULL is observed,
|
||||
* or the zv is protected from being freed because of the positive
|
||||
* zv_open_count.
|
||||
*/
|
||||
zv = dev->si_drv2;
|
||||
if (zv == NULL) {
|
||||
rw_exit(&zvol_state_lock);
|
||||
err = SET_ERROR(ENXIO);
|
||||
goto out_locked;
|
||||
}
|
||||
zv = atomic_load_ptr(&dev->si_drv2);
|
||||
if (zv == NULL)
|
||||
return (SET_ERROR(ENXIO));
|
||||
|
||||
mutex_enter(&zv->zv_state_lock);
|
||||
if (zv->zv_zso->zso_dying) {
|
||||
rw_exit(&zvol_state_lock);
|
||||
if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) {
|
||||
err = SET_ERROR(ENXIO);
|
||||
goto out_zv_locked;
|
||||
goto out_locked;
|
||||
}
|
||||
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
|
||||
|
||||
@@ -939,6 +936,13 @@ retry:
|
||||
mutex_exit(&zv->zv_state_lock);
|
||||
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
|
||||
mutex_enter(&zv->zv_state_lock);
|
||||
|
||||
if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
|
||||
/* Removal started while locks were down. */
|
||||
err = SET_ERROR(ENXIO);
|
||||
goto out_locked;
|
||||
}
|
||||
|
||||
/* Check to see if zv_suspend_lock is needed. */
|
||||
if (zv->zv_open_count != 0) {
|
||||
rw_exit(&zv->zv_suspend_lock);
|
||||
@@ -946,7 +950,6 @@ retry:
|
||||
}
|
||||
}
|
||||
}
|
||||
rw_exit(&zvol_state_lock);
|
||||
|
||||
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
||||
|
||||
@@ -974,7 +977,7 @@ retry:
|
||||
if (drop_namespace)
|
||||
mutex_exit(&spa_namespace_lock);
|
||||
if (err)
|
||||
goto out_zv_locked;
|
||||
goto out_locked;
|
||||
}
|
||||
|
||||
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
||||
@@ -1001,9 +1004,8 @@ out_opened:
|
||||
zvol_last_close(zv);
|
||||
wakeup(zv);
|
||||
}
|
||||
out_zv_locked:
|
||||
mutex_exit(&zv->zv_state_lock);
|
||||
out_locked:
|
||||
mutex_exit(&zv->zv_state_lock);
|
||||
if (drop_suspend)
|
||||
rw_exit(&zv->zv_suspend_lock);
|
||||
return (err);
|
||||
@@ -1015,12 +1017,9 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
|
||||
zvol_state_t *zv;
|
||||
boolean_t drop_suspend = B_TRUE;
|
||||
|
||||
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
|
||||
zv = dev->si_drv2;
|
||||
if (zv == NULL) {
|
||||
rw_exit(&zvol_state_lock);
|
||||
zv = atomic_load_ptr(&dev->si_drv2);
|
||||
if (zv == NULL)
|
||||
return (SET_ERROR(ENXIO));
|
||||
}
|
||||
|
||||
mutex_enter(&zv->zv_state_lock);
|
||||
if (zv->zv_flags & ZVOL_EXCL) {
|
||||
@@ -1045,6 +1044,15 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
|
||||
mutex_exit(&zv->zv_state_lock);
|
||||
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
|
||||
mutex_enter(&zv->zv_state_lock);
|
||||
|
||||
/*
|
||||
* Unlike in zvol_cdev_open(), we don't check if
|
||||
* removal started here, because we might be one of the
|
||||
* openers that needs to be thrown out! If we're the
|
||||
* last, we need to call zvol_last_close() below to
|
||||
* finish cleanup. So, no special treatment for us.
|
||||
*/
|
||||
|
||||
/* Check to see if zv_suspend_lock is needed. */
|
||||
if (zv->zv_open_count != 1) {
|
||||
rw_exit(&zv->zv_suspend_lock);
|
||||
@@ -1054,7 +1062,6 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
|
||||
} else {
|
||||
drop_suspend = B_FALSE;
|
||||
}
|
||||
rw_exit(&zvol_state_lock);
|
||||
|
||||
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
|
||||
|
||||
@@ -1086,7 +1093,8 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
|
||||
int error;
|
||||
boolean_t sync;
|
||||
|
||||
zv = dev->si_drv2;
|
||||
zv = atomic_load_ptr(&dev->si_drv2);
|
||||
ASSERT3P(zv, !=, NULL);
|
||||
|
||||
error = 0;
|
||||
KASSERT(zv->zv_open_count > 0,
|
||||
@@ -1147,6 +1155,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
|
||||
*(off_t *)data = 0;
|
||||
break;
|
||||
case DIOCGATTR: {
|
||||
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
|
||||
spa_t *spa = dmu_objset_spa(zv->zv_objset);
|
||||
struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
|
||||
uint64_t refd, avail, usedobjs, availobjs;
|
||||
@@ -1171,6 +1180,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
|
||||
arg->value.off = refd / DEV_BSIZE;
|
||||
} else
|
||||
error = SET_ERROR(ENOIOCTL);
|
||||
rw_exit(&zv->zv_suspend_lock);
|
||||
break;
|
||||
}
|
||||
case FIOSEEKHOLE:
|
||||
@@ -1181,10 +1191,12 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
|
||||
|
||||
hole = (cmd == FIOSEEKHOLE);
|
||||
noff = *off;
|
||||
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
|
||||
lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX,
|
||||
RL_READER);
|
||||
error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
|
||||
zfs_rangelock_exit(lr);
|
||||
rw_exit(&zv->zv_suspend_lock);
|
||||
*off = noff;
|
||||
break;
|
||||
}
|
||||
@@ -1398,7 +1410,7 @@ zvol_os_remove_minor(zvol_state_t *zv)
|
||||
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
|
||||
struct zvol_state_geom *zsg = &zso->zso_geom;
|
||||
struct g_provider *pp = zsg->zsg_provider;
|
||||
pp->private = NULL;
|
||||
atomic_store_ptr(&pp->private, NULL);
|
||||
mutex_exit(&zv->zv_state_lock);
|
||||
|
||||
g_topology_lock();
|
||||
@@ -1409,7 +1421,7 @@ zvol_os_remove_minor(zvol_state_t *zv)
|
||||
struct cdev *dev = zsd->zsd_cdev;
|
||||
|
||||
if (dev != NULL)
|
||||
dev->si_drv2 = NULL;
|
||||
atomic_store_ptr(&dev->si_drv2, NULL);
|
||||
mutex_exit(&zv->zv_state_lock);
|
||||
|
||||
if (dev != NULL) {
|
||||
|
||||
Reference in New Issue
Block a user