Fix zvol_open() lock inversion

When restructuring the zvol_open() logic for the Linux 5.13 kernel
a lock inversion was accidentally introduced.  In the updated code
the spa_namespace_lock is now taken before the zv_suspend_lock
allowing the following scenario to occur:

    down_read <=== waiting for zv_suspend_lock
    zvol_open <=== holds spa_namespace_lock
    __blkdev_get
    blkdev_get_by_dev
    blkdev_open
    ...

     mutex_lock <== waiting for spa_namespace_lock
     spa_open_common
     spa_open
     dsl_pool_hold
     dmu_objset_hold_flags
     dmu_objset_hold
     dsl_prop_get
     dsl_prop_get_integer
     zvol_create_minor
     dmu_recv_end
     zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend()
     zfs_ioc_recv
     ...

This commit resolves the issue by moving the acquisition of the
spa_namespace_lock back to after the zv_suspend_lock which restores
the original ordering.

Additionally, as part of this change the error exit paths were
simplified where possible.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #12863
This commit is contained in:
Brian Behlendorf 2021-12-17 09:52:13 -08:00 committed by Tony Hutter
parent 4b2bac5fe9
commit 9ec630ff2c

View File

@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
{ {
zvol_state_t *zv; zvol_state_t *zv;
int error = 0; int error = 0;
boolean_t drop_suspend = B_TRUE; boolean_t drop_suspend = B_FALSE;
boolean_t drop_namespace = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
hrtime_t start = gethrtime(); hrtime_t start = gethrtime();
@ -517,7 +516,36 @@ retry:
return (SET_ERROR(-ENXIO)); return (SET_ERROR(-ENXIO));
} }
if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { mutex_enter(&zv->zv_state_lock);
/*
* Make sure zvol is not suspended during first open
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
*/
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
} else {
drop_suspend = B_TRUE;
}
} else {
drop_suspend = B_TRUE;
}
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
if (zv->zv_open_count == 0) {
boolean_t drop_namespace = B_FALSE;
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
/* /*
* In all other call paths the spa_namespace_lock is taken * In all other call paths the spa_namespace_lock is taken
* before the bdev->bd_mutex lock. However, on open(2) * before the bdev->bd_mutex lock. However, on open(2)
@ -542,84 +570,51 @@ retry:
* the kernel so the only option is to return the error for * the kernel so the only option is to return the error for
* the caller to handle it. * the caller to handle it.
*/ */
if (!mutex_tryenter(&spa_namespace_lock)) { if (!mutex_owned(&spa_namespace_lock)) {
rw_exit(&zvol_state_lock); if (!mutex_tryenter(&spa_namespace_lock)) {
mutex_exit(&zv->zv_state_lock);
rw_exit(&zv->zv_suspend_lock);
#ifdef HAVE_BLKDEV_GET_ERESTARTSYS #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
schedule(); schedule();
return (SET_ERROR(-ERESTARTSYS));
#else
if ((gethrtime() - start) > timeout)
return (SET_ERROR(-ERESTARTSYS)); return (SET_ERROR(-ERESTARTSYS));
#else
if ((gethrtime() - start) > timeout)
return (SET_ERROR(-ERESTARTSYS));
schedule_timeout(MSEC_TO_TICK(10)); schedule_timeout(MSEC_TO_TICK(10));
goto retry; goto retry;
#endif #endif
} else { } else {
drop_namespace = B_TRUE; drop_namespace = B_TRUE;
}
}
mutex_enter(&zv->zv_state_lock);
/*
* make sure zvol is not suspended during first open
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
*/
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
drop_suspend = B_FALSE;
} }
} }
} else {
drop_suspend = B_FALSE;
}
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
if (zv->zv_open_count == 0) {
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
if (error)
goto out_mutex; if (drop_namespace)
mutex_exit(&spa_namespace_lock);
} }
if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { if (error == 0) {
error = -EROFS; if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
goto out_open_count; if (zv->zv_open_count == 0)
} zvol_last_close(zv);
zv->zv_open_count++; error = SET_ERROR(-EROFS);
} else {
zv->zv_open_count++;
}
}
mutex_exit(&zv->zv_state_lock); mutex_exit(&zv->zv_state_lock);
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (drop_suspend) if (drop_suspend)
rw_exit(&zv->zv_suspend_lock); rw_exit(&zv->zv_suspend_lock);
zfs_check_media_change(bdev); if (error == 0)
zfs_check_media_change(bdev);
return (0); return (error);
out_open_count:
if (zv->zv_open_count == 0)
zvol_last_close(zv);
out_mutex:
mutex_exit(&zv->zv_state_lock);
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (SET_ERROR(error));
} }
static void static void